diff --git a/.github/actions/setup-guest-toolchain/action.yml b/.github/actions/setup-guest-toolchain/action.yml index 80f5712..972153d 100644 --- a/.github/actions/setup-guest-toolchain/action.yml +++ b/.github/actions/setup-guest-toolchain/action.yml @@ -28,6 +28,10 @@ inputs: description: "Foundry version tag" required: false default: "v1.4.3" + rust-toolchain: + description: "Rust toolchain version or channel" + required: false + default: "1.95.0" rust-components: description: "Extra rustup components (comma-separated)" required: false @@ -109,6 +113,7 @@ runs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ inputs.rust-toolchain }} components: ${{ inputs.rust-components }} - name: Cache Rust artifacts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f203d8b..65d9ecc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ on: pull_request: env: + RUST_TOOLCHAIN: "1.95.0" XGENEXT2FS_VERSION: v1.5.6 XGENEXT2FS_SHA256_AMD64: 996e4e68a638b5dc5967d3410f92ecb8d2f41e32218bbe0f8b4c4474d7eebc59 XGENEXT2FS_SHA256_ARM64: e5aca81164b762bbe5447bacef41e4fa9e357fd9c8f44e519c5206227d43144d @@ -25,6 +26,8 @@ jobs: run: | sudo apt-get update sudo apt-get install -y \ + faketime \ + libfaketime \ lua5.4 \ liblua5.4-dev \ libslirp-dev @@ -32,6 +35,7 @@ jobs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ env.RUST_TOOLCHAIN }} components: rustfmt, clippy - name: Cache Rust artifacts @@ -53,7 +57,7 @@ jobs: - name: Test timeout-minutes: 15 - run: RUN_ANVIL_TESTS=1 cargo test --workspace --all-targets --all-features --locked + run: cargo test --workspace --all-targets --all-features --locked canonical-guest: runs-on: ubuntu-latest @@ -67,6 +71,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} @@ -92,6 +97,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} @@ -100,5 +106,10 @@ jobs: cartesi-machine-sha256-arm64: ${{ env.CARTESI_MACHINE_SHA256_ARM64 }} install-foundry: "true" + - name: Install faketime + run: | + sudo apt-get update + sudo apt-get install -y faketime libfaketime + - name: Run rollups E2E tests run: just test-rollups-e2e diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a561bcf..7d23ac3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,6 +20,7 @@ permissions: contents: write env: + RUST_TOOLCHAIN: "1.95.0" XGENEXT2FS_VERSION: v1.5.6 XGENEXT2FS_SHA256_AMD64: 996e4e68a638b5dc5967d3410f92ecb8d2f41e32218bbe0f8b4c4474d7eebc59 XGENEXT2FS_SHA256_ARM64: e5aca81164b762bbe5447bacef41e4fa9e357fd9c8f44e519c5206227d43144d @@ -59,6 +60,7 @@ jobs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ env.RUST_TOOLCHAIN }} targets: ${{ matrix.target }} - name: Cache Rust artifacts @@ -123,6 +125,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} diff --git a/.gitignore b/.gitignore index 822d909..0359111 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ sequencer.db-wal /out/ /.DS_Store soljson-latest.js +**/states/ diff --git a/AGENTS.md b/AGENTS.md index 2c468e0..3b98ec6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,110 +1,284 @@ # AGENTS.md -This file tells AI coding agents how to work effectively in this repository. +This file tells AI coding agents and human contributors how to work effectively in this repository. Start here. ## Mission -Build and evolve a **sequencer prototype** for a future DeFi stack. +Build and evolve a **DeFi sequencer** — the off-chain component that gives users low-latency soft confirmations while preserving the on-chain scheduler's canonical authority. -Current scope is intentionally small: a **dummy wallet app** that supports: -- `Transfer` -- `Withdrawal` +This is **security-critical infrastructure**. Treat every change with the care that financial systems demand. Correctness, determinism, and safety come before features. -Primary objective in this phase: make sequencer behavior, safety checks, and persistence reliable before adding "real world" execution logic. +The current application (`examples/app-core/`) is a **hardcoded placeholder** (deposit, transfer, withdrawal). It will be replaced by a production DeFi application. The sequencer itself is the product; the app is a stand-in for development. -## Project Snapshot +## Requirements -- Language: Rust (`edition = 2024`) -- API: Axum -- Queueing: Tokio MPSC -- Commit path: single blocking inclusion lane (hot path) -- Storage: SQLite (`rusqlite`, WAL mode) -- Signing: EIP-712 (`alloy`) -- Method payload encoding: SSZ +In order of importance: -## Glossary +1. **Low latency** — `POST /tx` ack under 500 ms. +2. **Financially sustainable** — the system must pay for itself through fees. +3. **Low cost transactions** — cheaper than native L1. -- `chunk`: small bounded list of user ops processed/executed and persisted together to amortize SQLite cost and keep low-latency ack behavior. -- `frame`: canonical ordering boundary that commits a `safe_block` plus a list of user ops; canonical execution drains all direct inputs safe at that block before executing the frame’s user ops. -- `batch`: list of frames that will be posted on-chain as one unit. -- `inclusion lane`: the hot-path single-lane loop that dequeues user ops, executes app logic, persists ordering, and rotates frame/batch boundaries. +## Invariants + +- **Dispute compatibility** — the design already accounts for rollup dispute resolution. Preserve it. +- **Wallet-compatible signing** — users sign with standard wallets via EIP-712. Never introduce custom signing schemes. +- **Deposit availability < 10 minutes** — happy path. The censorship-resistance backstop (`MAX_WAIT_BLOCKS`, ~4h) is the worst case. + +## Design Principles + +- **App-specific sequencer.** The sequencer may link against the application, enabling validation and execution at ingress time. This is a deliberate design choice. +- **Soft confirmations may be invalidated.** Under adversarial conditions (network, infrastructure, provider, or L1 outages), soft confirmations can be rolled back via recovery. This is by design, not a bug — it is what makes the sequencer sound in the face of liveness failures. +- **App UX may depend on the sequencer.** Without the sequencer, user experience may degrade substantially. This is an acceptable tradeoff: the on-chain scheduler remains the canonical source of truth; the sequencer only accelerates the UX. + +## Sequencer / Scheduler Duality + +The system has two components in an asymmetric relationship: + +### Scheduler — on-chain canonical authority + +The scheduler runs inside the rollup and **defines the canonical transaction ordering**. For each batch read from L1 safe inputs, it processes frames in order: drain all pending direct inputs whose block number is ≤ `safe_block`, then execute the frame's user ops. **The scheduler treats the sequencer as potentially Byzantine** — it enforces ordering and staleness rules regardless of what the sequencer claims. + +### Sequencer — off-chain predictor + +The sequencer knows the scheduler's algorithm. It uses that knowledge to **predict** what the canonical ordering will be once its batches land on L1, and issues soft confirmations to users ahead of time. The sequencer has **write priority on the execution queue**: as long as it keeps advancing `safe_block` and submitting batches, it controls ordering. + +### The `safe_block` synchronization primitive + +Each frame carries a `safe_block` chosen by the sequencer. It serves two purposes: + +- It tells the scheduler how far to drain direct inputs before executing the frame's user ops. +- It is the sequencer's commitment that it has accounted for all direct inputs up to that block. + +The sequencer must advance `safe_block` honestly. If it freezes `safe_block` (to censor deposits) or stops submitting batches, the staleness mechanism detects this and forces recovery. + +### When soft confirmations match canonical order + +Under honest sequencer operation and no infrastructure outages, soft confirmations match the canonical order. This is an **optimistic guarantee** — the sequencer is predicting a future the scheduler has not yet computed. When the sequencer goes offline, submits stale batches, or tries to censor direct inputs, the scheduler's force-drain backstop kicks in and the affected soft confirmations become invalid. + +## Batch Staleness and Recovery + +### Staleness + +A batch is **stale** when `inclusion_block - first_frame.safe_block >= MAX_WAIT_BLOCKS` (1200 blocks, ~4h). Staleness catches two failure modes: + +1. **Liveness failure** — the sequencer went offline and failed to submit batches in time. +2. **Censorship** — the sequencer kept submitting batches but froze `safe_block` to hold back direct inputs. + +When the scheduler encounters a stale batch, it **skips it entirely** — no nonce consumed, no state change. This is the **censorship-resistance backstop**: the sequencer cannot hold write priority indefinitely without advancing the drain cursor. Direct inputs are force-drained at `MAX_WAIT_BLOCKS`, guaranteeing deposit availability within ~4h even under adversarial conditions. + +### Cascading invalidation + +If a batch is stale, all existing subsequent batches are also invalid. The scheduler's expected-nonce counter does not advance on a stale skip, so every subsequent batch arrives at an unexpected nonce and is rejected. Invalidation is a suffix operation: marking batch `N` invalid cascades to `N+1`, `N+2`, …, including the open batch. New batches created after recovery are unaffected. + +### Preemptive recovery + +Rather than waiting for a batch to go stale on L1, the sequencer uses a **danger threshold** (`MAX_WAIT_BLOCKS − MARGIN`). The threshold is *only a trigger*: it tells the system "stop running, hand off to recovery." It does not encode "this batch is doomed" — that decision belongs to the post-flush cascade. + +The cycle crosses a process boundary by design: + +1. **Detector trips + process exits** — the in-process [`DangerDetector`](sequencer/src/recovery/detector.rs) polls `Storage::check_danger` on a cadence. When the L1-view-stale, observed closed-batch, observed Tip, or batch-relative wall-clock arm fires, the detector exits with `DetectorExit::RecoveryRequired`, the runtime maps that to `RunError::DangerDetected`, and the process exits with a non-zero status. Stopping the process is how the sequencer goes offline: no more user-op acceptance, no more batch submission. +2. **Orchestrator respawns** — systemd/k8s/etc. restarts the process. +3. **Startup syncs and dispatches** — the fresh process syncs the L1 safe head if reachable, re-runs `Storage::check_danger`, then [`decide_startup_action`](sequencer/src/recovery/mod.rs) chooses the startup path. +4. **Startup runs recovery** — dispatched by the danger status: + - **`RecoverTip`** → [`Storage::recover_aging_tip(danger_threshold)`](sequencer/src/storage/recovery.rs): no flush ran. The open Tip has no L1 footprint, so invalidate it directly once its first frame has aged past the danger threshold. + - **`FlushAndCascade`** → [`MempoolFlusher`](sequencer/src/recovery/flusher.rs) consumes pending wallet-nonce slots, startup re-syncs L1, then [`Storage::recover_post_flush(danger_threshold)`](sequencer/src/storage/recovery.rs) cascades from the first non-gold closed batch (every non-gold batch past the post-flush gold frontier is doomed — Silver-stale, Silver-poisoned, or no-op'd Pending). If all closed batches landed gold, fall through to a Tip check against `danger_threshold` (handles the corner case where `S_tip = S_closed`, the closed batch lands fresh, and the Tip's age clears the danger zone after the flush wait). + - **`Proceed`** → [`Storage::recover_aging_tip(danger_threshold)`](sequencer/src/storage/recovery.rs): no flush ran and no danger was detected. Closed batches past gold may still be in their natural lifecycle, so leave them alone; the Tip check is defensive and normally a no-op. + - **`Refuse`** → startup stops and surfaces the reason to the operator. Refusal is used when the L1 safe block timestamp is missing/too old, or when batch-relative wall-clock estimation says unresolved work has consumed its remaining runway without observed safe-state support for recovery. +5. **Normal operation resumes** — the lane, submitter, input reader, and a fresh detector all start up. + +See [`docs/recovery/README.md`](docs/recovery/README.md) Step 5 for the "everything past gold is doomed" mental model and why the post-flush cascade is unconditional rather than threshold-based. + +### Detection: safe-only, with wall-clock fallback + +Staleness is only checked against L1 **safe** state, never latest. Stale batches in latest that haven't reached safe yet will eventually become safe, and the check will fire at that point. This avoids reacting to L1 reorgs. + +When the sequencer's view of L1 stops advancing — most often because the RPC gateway is stalled or returning stale reads, occasionally because L1 itself is unhealthy — the DB-based staleness check sees a frozen `current_safe_block` and may fail to trigger. The danger detector uses two wall-clock signals: the recorded L1 safe block timestamp must remain younger than `SEQ_L1_READ_STALE_AFTER_BLOCKS`, and unresolved batches are also checked with `estimated_missed_blocks = (now − last_safe_progress_ms) / seconds_per_block` by adjusting the danger threshold downward. This prevents silently issuing doomed soft confirmations during stale-provider periods or L1 outages. + +### Formal verification + +The preemptive recovery design is verified by bounded TLA+ model checking. See [`docs/recovery/`](docs/recovery/) for the full design, TLA+ specs, and design history. When touching recovery code, read the TLA+ first. + +## Threat Model (brief) + +See [`docs/threat-model/README.md`](docs/threat-model/README.md) for the full model. Key points when reading or writing code: + +- **Trusted:** InputBox contract, our own Ethereum node (fail-stop, not byzantine), operator config, batch-submitter key. +- **Adversarial:** `POST /tx` callers, direct-input senders, the L1 mempool and block builders (zombie transactions are a first-class threat). +- **Semi-trusted, fail-stop:** fallback RPC providers (Infura / Alchemy). +- **Self-trust:** the sequencer trusts its own code is correct. Bugs that emit malformed batches are fault states requiring manual intervention, not threats to defend against at runtime. +- **In scope:** correctness bugs *and* exploitation. Under rollup semantics, a correctness bug that causes scheduler/sequencer state divergence is as severe as direct theft. ## Architecture Map -- `sequencer/src/main.rs`: thin binary entrypoint. -- `sequencer/src/lib.rs`: public sequencer API (`run`, `RunConfig`). -- `sequencer/src/config.rs`: runtime input parsing and EIP-712 domain construction. -- `sequencer/src/runtime.rs`: bootstrap and runtime wiring. -- `sequencer/src/api/mod.rs`: `POST /tx` and `GET /ws/subscribe` endpoints (tx ingress + replay feed). -- `sequencer/src/api/error.rs`: API error model + HTTP mapping. -- `sequencer/src/inclusion_lane/mod.rs`: inclusion-lane exports and public surface. -- `sequencer/src/inclusion_lane/lane.rs`: batched execution/commit loop (single lane). -- `sequencer/src/inclusion_lane/types.rs`: inclusion-lane queue item and pipeline error types. -- `sequencer/src/inclusion_lane/error.rs`: inclusion-lane runtime and catch-up error types. -- `sequencer/src/input_reader/`: safe-input ingestion from InputBox into SQLite. -- `sequencer/src/l2_tx_feed/mod.rs`: DB-backed ordered-L2Tx feed used by WS subscriptions. -- `sequencer/src/storage/mod.rs`: DB open, migrations, frame persistence, and direct-input broker APIs. -- `sequencer/src/storage/migrations/`: DB schema/bootstrapping (`0001`). -- `sequencer-core/src/`: shared domain types/interfaces (`Application`, `SignedUserOp`, `SequencedL2Tx`, broadcast message model). -- `examples/app-core/src/application/mod.rs`: wallet prototype implementing `Application`. -- `tests/benchmarks/src/`: benchmark harnesses and self-contained benchmark runtime. - -## Domain Truths (Important) - -- This is a **sequencer prototype**, not a full DeFi stack yet. -- API validates signature and enqueues signed `UserOp`; method decoding happens during application execution. -- Deposits are direct-input-only (L1 -> L2) and must not be represented as user ops. -- Rejections (`InvalidNonce`, fee cap too low, insufficient gas balance) produce no state mutation and are not persisted. -- Included txs are persisted as frame/batch data in `batches`, `frames`, `user_ops`, `safe_inputs`, and `sequenced_l2_txs`. -- Frame fee is persisted in `frames.fee` and is fixed for the lifetime of that frame. -- The next frame fee is sampled from `batch_policy_derived.recommended_fee` when rotating to a new frame (defaults follow `batch_policy` bootstrap rows; tune `gas_price` / `alpha` via SQLite if needed). -- `/ws/subscribe` currently has internal guardrails: subscriber cap `64`, catch-up cap `50000`. -- When that catch-up window is exceeded, `/ws/subscribe` upgrades and then closes with websocket close code `1008` (`POLICY`) and reason `catch-up window exceeded`. -- Wallet state (balances/nonces) is in-memory right now (not persisted). -- EIP-712 domain name/version are fixed in code; chain ID and verifying contract come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` (validated against the RPC chain id at startup). +Top-level layout follows the system's data flow. Each sequencer module corresponds to a writer role; the matching `storage/.rs` holds its storage half. + +### Workspace + +- `sequencer/` — main sequencer binary and library. +- `sequencer-core/` — shared domain types (`Application`, `SignedUserOp`, `SequencedL2Tx`, `Batch`, `Frame`). +- `examples/app-core/` — placeholder wallet app implementing the `Application` trait. +- `examples/canonical-app/` — on-chain scheduler reference implementation. +- `examples/canonical-test/` — e2e test harness for the canonical app. +- `sdk/rust-client/` — Rust client library for the sequencer API. +- `tests/{benchmarks,e2e,harness}/` — test infrastructure. + +### Sequencer module layout + +- `sequencer/src/main.rs` — thin binary entrypoint. +- `sequencer/src/lib.rs` — public sequencer API (`run`, `RunConfig`). +- `sequencer/src/http.rs` — shared HTTP error type, JSON `ErrorResponse`, `ApiConfig`, and `axum::serve` orchestration. +- `sequencer/src/runtime/` — process bootstrap, `RunConfig`, EIP-712 domain, `ShutdownSignal`, shared `clock::unix_now_ms`. +- `sequencer/src/ingress/` — public write path. + - `api.rs` — `POST /tx` handler, JSON-rejection mapping. + - `inclusion_lane/` — single-lane hot-path loop (`mod.rs`), catch-up replay, config, error types. +- `sequencer/src/egress/` — internal read path. + - `api/` — `/ws/subscribe`, `/livez`, `/readyz`, `/healthz`. + - `l2_tx_feed/` — DB-backed ordered-tx feed. +- `sequencer/src/l1/` — L1 client surface. + - `reader.rs` — safe-input ingestion from InputBox into SQLite. + - `submitter/` — stateless batch submitter (`worker.rs` + `poster.rs`). + - `provider.rs` — alloy provider construction. + - `partition.rs` — long-block-range retry helper. +- `sequencer/src/recovery/` — preemptive recovery startup procedure (`mod.rs`), runtime danger detector (`detector.rs`), and mempool flusher (`flusher.rs`). +- `sequencer/src/storage/` — SQLite persistence, split by writer role (`ingress`, `egress`, `l1_inputs`, `l1_submission`, `recovery`, `admin`, plus shared `mod`, `open`, `internals`, and `migrations/`). + +## Key Concepts + +- **Chunk** — bounded list of user ops processed and persisted together to amortize SQLite cost. +- **Frame** — ordering boundary; commits `safe_block` + user ops. +- **Batch** — list of frames posted on-chain as one L1 transaction (SSZ-encoded). +- **Inclusion lane** — hot-path single-lane loop that dequeues, executes, persists, and rotates frame/batch boundaries. The only writer of open batch/frame state. +- **Batch submitter** — stateless worker that bulk-submits all pending batches each tick. Nonces are assigned by storage (structural `parent.nonce + 1`) when batches are closed; the submitter just reads them. +- **Danger detector** — background worker that polls `Storage::check_danger` on a fixed cadence and exits with `RecoveryRequired` when any non-`Safe` danger status fires. Never writes to the DB; never talks to L1. Crashes the process so startup recovery or refusal can run. +- **Input reader** — ingests safe inputs from L1 InputBox into SQLite. +- **L2 tx feed** — DB-backed ordered-tx stream used by WS subscribers. +- **Soft confirmation** — sequencer's predicted ordering, emitted before the batch lands on L1. + +## Domain Truths + +- API validates the EIP-712 signature and enqueues a `SignedUserOp`. Method payload decoding happens during application execution, not at ingress. +- **Deposits are direct-input-only** (L1 → L2) and must not be represented as user ops. +- Rejections (`InvalidNonce`, `InvalidMaxFee`, `InsufficientGasBalance`) produce no state mutation and are not persisted. +- Included txs are persisted as frame/batch data in `batches`, `frames`, `user_ops`, `safe_inputs`, and `sequenced_l2_txs`. Recovery metadata lives in `safe_accepted_batches`; batch lifecycle state (sealed/invalidated) lives on the `batches` row itself as write-once timestamps. +- Frame fee is persisted in `frames.fee` and is fixed for the lifetime of that frame. The next frame's fee is sampled from `batch_policy_derived.recommended_fee` at rotation. +- Wallet state (balances, nonces) is in-memory today — not persisted. +- **EIP-712 domain fields:** `name`, `version`, `chainId`, `verifyingContract`. `chainId` and `verifyingContract` come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` (validated against the RPC chain id at startup). All four fields must be present on both sides — both the sequencer and the on-chain scheduler construct the domain via `sequencer_core::build_input_domain`, the canonical shared constructor. + +### InputBox payload classification + +- The input reader ingests every `InputAdded` event from InputBox. Each event carries an authenticated `msg_sender` (delivered by the Cartesi framework from `EvmAdvanceCall`). +- **Classification is by sender address**, not by a tag byte: + - Sender == batch-submitter address → SSZ-decoded as `Batch` (scheduler side). The sequencer does not ingest its own batch submissions as direct inputs. + - Any other sender → stored verbatim as a direct input (deposit). +- The payload is opaque to the classification layer. Application-specific decoding happens inside `Application::execute_direct_input`. + +## Application Trait Contract + +Implementors of the `Application` trait must respect these contracts. The sequencer assumes them without runtime enforcement. + +### Replay determinism + +The sequencer persists every included user op and every ingested direct input. On restart, catch-up replays them in order against a fresh `Application` instance to rebuild state. **Any input that succeeded live must succeed on replay.** + +- `execute_direct_input` and `execute_valid_user_op` must not return `AppError::Internal` for any byte sequence that previously executed successfully. Catch-up treats `Internal` as fatal: it aborts startup and leaves the sequencer unable to resume. +- Prefer `ExecutionOutcome::Invalid` for malformed or ill-typed input caught at the app level. Reserve `AppError::Internal` for genuine invariant violations ("validated user op cannot pay fee") — real bugs, not adversarial inputs. `Invalid` is replay-safe; `Internal` is not. +- `validate_user_op` must be pure over the current app state. No side effects, no time dependence, no randomness. + +### No implicit state + +Application state changes must flow exclusively through `execute_valid_user_op` and `execute_direct_input`. Mutating state from `validate_user_op` or `current_user_nonce` breaks replay determinism. ## Hot-Path Invariants - API ack is tied to chunk durability, not frame/batch closure. - Chunk commit and ack remain low-latency; frame closure is orthogonal and can happen less frequently. -- API overload for `POST /tx` is currently defined by inclusion-lane queue admission: if `try_send` hits a full queue, the handler returns `429 OVERLOADED` with message `queue full`. +- `POST /tx` queue admission: `try_send` on a full queue returns `429 OVERLOADED` with message `queue full`. - Frame closure happens when direct inputs are drained, and also whenever batch closure happens. - Batch closure is controlled by batch policy (size and/or deadline). -- Preserve single-lane deterministic ordering; do not introduce extra concurrency in hot-path ordering logic without explicit approval. +- Preserve single-lane deterministic ordering. Do not introduce extra concurrency in hot-path ordering logic without explicit approval. ## Storage Invariants - Storage model is append-oriented; avoid mutable status flags for open/closed entities. -- Open batch/frame are derived by “latest row” convention. -- A frame’s leading direct-input prefix is derivable from `sequenced_l2_txs` plus `frames.safe_block`. -- `safe_inputs` contains only L1 app direct input **bodies**. InputBox payload first byte: **0x00** = direct input (tag stripped, body stored and executed), **0x01** = batch submission (for scheduler, not stored), **others** = discarded (invalid/garbage). The input reader only accepts 0x00-tagged payloads and stores `payload[1..]`. +- Open batch/frame are derived by "latest row" convention. +- A frame's leading direct-input prefix is derivable from `sequenced_l2_txs` plus `frames.safe_block`. - Safe cursor/head values should be derived from persisted facts when possible, not duplicated as mutable fields. -- Replay/catch-up must use persisted ordering plus persisted frame fee (`frames.fee`) to mirror inclusion semantics. -- Included user-op identity is constrained by `UNIQUE(sender, nonce)`. +- Replay/catch-up uses persisted ordering plus persisted frame fee (`frames.fee`) to mirror inclusion semantics exactly. +- Cursor pagination for ordered L2 txs uses **SQLite rowid**, not count-based offsets. Holes from invalidated batches would break count-based pagination. +- Included user-op identity is tracked by application nonce logic; no DB uniqueness constraint (removed to allow resubmission after recovery). +- **Reads over batch data go through `valid_batches`, `valid_closed_batches`, `valid_open_batch`, and `valid_sequenced_l2_txs` views.** These encapsulate the "exclude invalidated rows" filter so individual queries don't repeat it. Writers go to the base tables. +- **`batches` row columns partition cleanly by writer.** `sealed_at_ms` is owned by the inclusion lane (set when closing a batch); `invalidated_at_ms` is owned by recovery (set during cascade). Each is write-once (NULL → non-NULL, never back) and enforced by triggers. The partial unique index `ux_single_valid_tip` guarantees at most one row has both NULL — the Tip. +- The inclusion lane is the **only writer** of open batch/frame state. `Storage::append_user_ops_chunk` and the `close_*` methods trust the in-memory `WriteHead`; FK + PK constraints catch the dangerous failure modes. ## Type Boundaries -- `SignedUserOp`: ingress/API signature domain. -- `ValidUserOp`: app execution domain after validation boundary. -- `SequencedL2Tx`: ordered replay/fanout domain (`UserOp | DirectInput`). -- Keep private DB-only helper/intermediary types private to storage modules; prefer shared domain types at module boundaries. +- `SignedUserOp` — ingress/API signature domain (post-validation, pre-execution). +- `ValidUserOp` — application execution domain (after validation boundary). +- `SequencedL2Tx` — ordered replay/fanout domain (`UserOp | DirectInput`). +- Keep DB-only helper types private to storage modules; prefer shared domain types at module boundaries. + +## HTTP Endpoints + +- **Ingress** (public-facing): `POST /tx`. +- **Egress** (internal indexers): `GET /ws/subscribe`, `GET /livez`, `GET /readyz`, `GET /healthz`. + +Today both sides serve from one listener; the planned API split puts each side on its own port (same binary) so internal probes and subscribers can be firewalled from public submit traffic. + +`/ws/subscribe` internal guardrails: subscriber cap 64, catch-up cap 50000. When the catch-up window is exceeded, the handler upgrades and then closes with WebSocket close code `1008` (`POLICY`), reason `catch-up window exceeded`. + +Health semantics: `/livez` — 200 if the process is alive. `/readyz` — 200 if shutdown not requested AND inclusion-lane channel still open, else 503. `/healthz` — JSON `{ status, inclusion_lane }` mirroring the same 200/503. + +## Environment Variables + +**Required:** + +- `SEQ_ETH_RPC_URL` +- `SEQ_CHAIN_ID` +- `SEQ_APP_ADDRESS` +- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` + +**Optional:** + +- `SEQ_HTTP_ADDR` (default `127.0.0.1:3000`) +- `SEQ_DATA_DIR` (default `sequencer-data`; DB file `sequencer.db` inside it) +- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` +- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS` (default 5000) +- `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` (default 2) +- `SEQ_PREEMPTIVE_MARGIN_BLOCKS` (default 300, ~1h at 12s/block) +- `SEQ_L1_READ_STALE_AFTER_BLOCKS` (default derived before the danger threshold) +- `SEQ_SECONDS_PER_BLOCK` (default 12) + +## Coding Conventions + +- Prefer small, composable functions at module boundaries (`ingress::api` → `ingress::inclusion_lane` → `storage::ingress`; `egress::l2_tx_feed` ← `storage::egress`). +- Keep application validation and execution deterministic for a given input/state. No `SystemTime::now()`, `HashMap` iteration order, or floating-point in consensus paths. +- Surface user-facing errors via `ApiError` (in `http.rs`); keep internal failures descriptive but safe. +- Avoid introducing heavy dependencies without strong reason. +- Documentation style: lean. Module headers (1–4 lines) + docs on public methods only when the contract isn't obvious from name+signature. Use inline comments for **why**, never for **what**. +- **Don't layer defense-in-depth checks against sequencer self-bugs.** Correctness is enforced via tests and review. See "Self-trust" in [`docs/threat-model/README.md`](docs/threat-model/README.md). + +## Testing Guidance + +Focus tests on: -## Agent Priorities +- Signature + sender-validation edge cases. +- Nonce progression rules. +- Fee and rejection behavior. +- Included-vs-rejected commit behavior. +- Storage batch atomicity and uniqueness constraints. +- Scheduler/sequencer agreement — any invariant the two sides share should have at least one test that exercises both. -When making changes, optimize for: -1. Deterministic sequencing semantics. -2. Safety and correctness of transaction validation/execution. -3. Clear, testable boundaries between API, application logic, and storage. -4. Backward-compatible, explicit error handling. -5. Minimal, focused diffs. +Prefer black-box tests around `POST /tx` and commit outcomes for integration. + +Some `sequencer` tests use Anvil (Foundry). They run by default and fail with a clear message if `anvil` is not on PATH. Install Foundry or use `nix develop`. ## Fast Start Commands -Run from repo root: +See [`CLAUDE.md`](CLAUDE.md) for shell setup and the full command list. In short: ```bash cargo check -cargo test +cargo test --workspace --exclude canonical-test cargo fmt --all cargo clippy --all-targets --all-features -- -D warnings ``` @@ -119,34 +293,22 @@ SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5e cargo run -p sequencer ``` -Optional env vars: -- `SEQ_HTTP_ADDR` -- `SEQ_DATA_DIR` (default `sequencer-data`; DB file `sequencer.db` inside it) -- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` (alternative to `SEQ_BATCH_SUBMITTER_PRIVATE_KEY`) -- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` - -Required env vars: -- `SEQ_ETH_RPC_URL` -- `SEQ_CHAIN_ID` -- `SEQ_APP_ADDRESS` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` - ## Always / Ask First / Never ### Always -- Keep behavior explicit for transaction inclusion vs rejection. -- Preserve API error shape and status code mapping unless intentionally changing API contract. +- Keep inclusion-vs-rejection semantics explicit for transaction handling. +- Preserve API error shape and status code mapping unless intentionally changing the API contract. - Add or update tests when logic changes. - Run at least `cargo check` before finishing. +- Read `docs/recovery/` before touching recovery code, and `docs/threat-model/` before touching trust-boundary code. ### Ask First - Changing tx wire format (`UserOp`, SSZ payload layout, EIP-712 domain fields). - Changing DB schema or migration strategy. - Altering rejection semantics (what consumes nonce/gas vs what is rejected). -- Introducing concurrency changes to commit ordering guarantees. +- Introducing concurrency changes to commit ordering. - Changing chunk/frame/batch closure or ack semantics. ### Never @@ -156,50 +318,25 @@ Required env vars: - Rely on implicit defaults for consensus-relevant values. - Remove guardrails around queue backpressure or inclusion-lane error reporting. -## Coding Conventions for This Repo - -- Prefer small, composable functions at module boundaries (`api` -> `application` -> `storage`). -- Keep application validation/execution deterministic for a given input/state. -- Surface user-facing errors via `ApiError`; keep internal failures descriptive but safe. -- Avoid introducing heavy dependencies without strong reason. - -## Testing Guidance - -Focus tests on: -- signature + sender validation edge cases -- nonce progression rules -- fee/rejection behavior -- included vs rejected commit behavior -- storage batch atomicity and uniqueness constraints +## Migration Policy -If adding integration tests, prefer black-box tests around `POST /tx` and commit outcomes. +At this stage it is acceptable to rewrite baseline migrations for clarity. There are no deployed environments requiring forward-only migrations. Keep schema bootstrap (initial open rows and invariants) explicit and deterministic. -Some `sequencer` tests use Anvil and are opt-in locally: +Once environments are shared or deployed, switch to append-only forward migrations. -```bash -RUN_ANVIL_TESTS=1 cargo test -p sequencer --lib -``` +## Definition of Done -## Definition of Done for Agent Changes +Before finishing a change, ensure: -Before finishing, ensure: 1. Code compiles (`cargo check`). -2. Changed behavior is covered by tests (or explain why tests are pending). -3. Formatting/lints are clean (or list any unresolved warnings explicitly). -4. PR summary includes: - - what changed - - why it changed - - risk/compatibility notes +2. Changed behavior is covered by tests, or explain why tests are pending. +3. Formatting and lints are clean, or list any unresolved warnings explicitly. +4. PR summary includes **what changed**, **why it changed**, and **risk / compatibility notes**. -## Near-Term Roadmap Hints - -Expected future evolution areas: -- stronger typing around tx metadata -- persistence for app state or deterministic replay -- explicit L1 block progression input - -## Migration Policy +## Related Documents -- Current prototype stage: it is acceptable to rewrite baseline migrations for clarity. -- Once environments are shared/deployed: switch to append-only forward migrations. -- Keep schema bootstrap (initial open rows/invariants) explicit and deterministic. +- [`README.md`](README.md) — product framing, user-facing trust model. +- [`CLAUDE.md`](CLAUDE.md) — shell setup, quick reference, pointer back here. +- [`docs/threat-model/README.md`](docs/threat-model/README.md) — trust boundaries, in-scope and out-of-scope threats. +- [`docs/recovery/README.md`](docs/recovery/README.md) — recovery design, TLA+ formal verification, design history. +- [`sequencer-core/`](sequencer-core/) — shared domain types and protocol contracts. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..492bc9b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,57 @@ +# CLAUDE.md + +Quick reference for working in this repository. For the full guide — architecture, duality, recovery, invariants, threat model, and rules — read [`AGENTS.md`](AGENTS.md). + +## Shell Environment + +This project uses Nix + direnv. Before running any command that needs project tools (Foundry, TLA+, etc.), activate the direnv environment: + +```bash +eval "$(direnv export bash 2>/dev/null)" +``` + +This makes `anvil`, `forge`, `cast`, `tlc`, and other Nix-provided tools available. Cargo and rustc are available without direnv. + +## Commands + +```bash +cargo check # compile check +cargo test --workspace --exclude canonical-test # run tests (canonical-test needs libslirp) +cargo fmt --all # format +cargo clippy --all-targets --all-features -- -D warnings # lint +cargo test -p sequencer --lib # includes Anvil-backed tests (needs Foundry on PATH) +``` + +## What This Is + +Off-chain sequencer for an app-specific DeFi rollup. Accepts signed user operations, issues low-latency soft confirmations, and posts batches to L1. Currently backed by a placeholder wallet app (transfer, withdrawal). **Security-critical infrastructure** — handle every change accordingly. + +Rust edition 2024 / Axum API / SQLite (rusqlite, WAL) / EIP-712 signing / SSZ encoding. + +## Workspace Layout + +- `sequencer/` — main sequencer binary and library. +- `sequencer-core/` — shared domain types consumed by both sequencer and scheduler. +- `examples/app-core/` — placeholder wallet app implementing `Application`. +- `examples/canonical-app/` — on-chain scheduler reference implementation. +- `examples/canonical-test/` — e2e test harness for the canonical app. +- `sdk/rust-client/` — Rust client library for the sequencer API. +- `tests/{benchmarks,e2e,harness}/` — test infrastructure. + +## Sequencer Module Layout + +`sequencer/src/` is organized by writer role; `storage/.rs` holds each role's storage half. + +- `runtime/` — bootstrap, config, shutdown, shared clock. +- `ingress/` — public write path: `api.rs` (`POST /tx`) + `inclusion_lane/` (hot path). +- `egress/` — internal read path: `api/` (WS subscribe + health) + `l2_tx_feed/`. +- `l1/` — reader, submitter, provider, partition helper. +- `recovery/` — startup preemptive-recovery procedure, runtime danger detector, mempool flusher. +- `storage/` — SQLite persistence, split per writer role. +- `http.rs` — shared HTTP error type + `axum::serve` orchestration. + +## Before You Start Real Work + +- **[`AGENTS.md`](AGENTS.md)** — mission, requirements, invariants, duality, recovery, conventions, rules. +- **[`docs/threat-model/README.md`](docs/threat-model/README.md)** — trust boundaries and in-scope threats. +- **[`docs/recovery/README.md`](docs/recovery/README.md)** — preemptive recovery design + TLA+ proofs. diff --git a/Cargo.lock b/Cargo.lock index 2d42e6b..1156d1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3581,6 +3581,7 @@ dependencies = [ "ethereum_ssz", "futures-util", "k256", + "rusqlite", "sequencer-core", "sequencer-rust-client", "serde", @@ -3870,6 +3871,7 @@ dependencies = [ "ethereum_ssz_derive", "futures-util", "k256", + "rollups-harness", "rusqlite", "rusqlite_migration", "sequencer-core", diff --git a/README.md b/README.md index d8b8997..d2a555e 100644 --- a/README.md +++ b/README.md @@ -1,189 +1,100 @@ -# Sequencer Prototype +# Sequencer -Prototype sequencer, currently backed by a dummy wallet app (`Transfer`, `Withdrawal`). +A sequencer for Cartesi app-specific rollups. Provides low-latency soft confirmations for user operations, posts them to L1 in batches, and maintains a deterministic replay feed that matches the application's final execution order. -Current focus is reliability of sequencing, persistence, and replay semantics. +**Security-critical infrastructure.** Handle every change with the care financial systems demand. -## Status +## What It Does -- Language: Rust (edition 2024) -- API: Axum (`POST /tx`, `GET /ws/subscribe`) -- Hot path: single blocking inclusion lane -- Storage: SQLite (`rusqlite`, WAL) -- Signing: EIP-712 (`alloy`) -- Payload encoding: SSZ +Rollup applications need fast transaction confirmations. Waiting for L1 finality on every user action (minutes) makes interactive applications impractical. The sequencer bridges this gap: it accepts signed user operations, immediately confirms them (soft confirmation), and asynchronously posts batches to L1. The application sees these batches posted on chain. -## Core Design +The core guarantee: **the off-chain sequencer and the rollup's on-chain scheduler produce identical execution order.** Users get instant feedback while the system converges to L1 truth. -- **User ops** arrive through the API, are validated, executed, and persisted by the inclusion lane. -- **Direct inputs** are stored in SQLite (`safe_inputs`) and sequenced in append-only replay order (`sequenced_l2_txs`). -- **Deposits** are direct-input-only (L1 -> L2) and are not accepted as user ops. -- **Ordering** is deterministic and persisted. Replay/catch-up reads `sequenced_l2_txs` joined with `user_ops` and `safe_inputs`. -- **Frame fee** is fixed per frame (`frames.fee`): - - users sign `max_fee` - - inclusion validates `max_fee >= current_frame_fee` - - execution charges `current_frame_fee` - - when opening a new frame or batch, the sequencer samples **`recommended_fee`** from the `batch_policy_derived` SQLite view (derived from `gas_price`, amortization `alpha`, and on-chain DA constants in `batch_policy`) -- **Batch closure by size** uses **`batch_size_target`** from the same view (stored on `WriteHead` as `max_batch_user_op_bytes`). The inclusion lane compares it to a **worst-case estimate** of in-batch user-op bytes (`batch_user_op_count × (per-op metadata cap + max method payload)`), not the exact SSZ-encoded batch size. A **time-based** max open duration also closes batches. +## Two Chains Synchronizing -## Quick Start +The sequencer maintains an optimistic chain of batches — a tree that normally degenerates into a list. Each batch contains frames, and each frame contains user operations plus a `safe_block` reference. The `safe_block` is the synchronization primitive: it tells the on-chain scheduler "drain all direct inputs (deposits) up to this L1 block, then execute these user ops." Both sides follow the rule, producing identical state. -From repo root: - -```bash -cargo check -cargo test -cargo fmt --all -cargo clippy --all-targets --all-features -- -D warnings ``` - -Run the server (example uses Anvil account #0 as batch submitter; use your own key in production): - -```bash -SEQ_ETH_RPC_URL=http://127.0.0.1:8545 \ -SEQ_CHAIN_ID=31337 \ -SEQ_APP_ADDRESS=0x1111111111111111111111111111111111111111 \ -SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 \ -cargo run -p sequencer +Sequencer (off-chain) Scheduler (on-chain) + frame: safe_block=100 drain directs up to block 100 + user_ops=[A, B, C] execute A, B, C + frame: safe_block=105 drain directs up to block 105 + user_ops=[D] execute D ``` -At startup the process checks that the RPC `eth_chainId` matches `SEQ_CHAIN_ID`. - -Optional runtime inputs: - -- `SEQ_HTTP_ADDR` defaults to `127.0.0.1:3000` -- `SEQ_DATA_DIR` defaults to `sequencer-data` (SQLite file is `sequencer.db` inside that directory; the directory is created if missing) -- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` defaults to `-32005,-32600,-32602,-32616` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` instead of `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` (first line of the file is the key) -- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` - -Required runtime inputs: +When things go well, the sequencer's chain and the scheduler's view converge. When they don't — batches arrive stale on L1 — the sequencer detects the divergence and recovers. -- `SEQ_ETH_RPC_URL` -- `SEQ_CHAIN_ID` -- `SEQ_APP_ADDRESS` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` +## Trust Model -Fixed protocol identity (EIP-712): +The sequencer is a **centralized, single-writer** system. It cannot steal funds or forge invalid state — the rollup validates everything independently, and the proof system later enforces it. But the sequencer can: -- domain name: `CartesiAppSequencer` -- domain version: `1` -- `chain_id` and `verifying_contract` come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` +- **Censor** — refuse to include a user's operations. +- **Go offline** — stop providing soft confirmations. +- **Diverge** — if batches fail to land on L1 in time, soft confirmations that were issued become invalid. -Most queue sizes, polling intervals, and safety limits are now internal runtime constants instead of public launch-time configuration. +**Direct inputs** (L1 → L2 messages, used for deposits) bypass the sequencer entirely. They are posted directly to L1 and are **uncensorable** by the sequencer — the scheduler drains them at every `safe_block` boundary. A censoring sequencer can delay when a direct input is executed (up to `MAX_WAIT_BLOCKS`, ~4h), but cannot prevent it. -## API +The third case is handled by the recovery subsystem. Batches that are too old when they reach L1 (`inclusion_block − safe_block ≥ MAX_WAIT_BLOCKS`) are skipped by the scheduler. This "staleness" poisons the nonce counter: all subsequent batches become unreachable regardless of their individual freshness. The sequencer detects this via a danger-zone threshold, preemptively goes offline, flushes the L1 mempool, and cascade-invalidates the doomed chain. See [`docs/recovery/`](docs/recovery/) for the full design, TLA+ formal verification, and design history. -### `POST /tx` +The sequencer trusts its own code is bug-free. Recovery means recovery from liveness failures, which can legitimately happen even in the absence of bugs (infrastructure outages, network failures, gateway failure). Code-level bugs are a separate problem handled by tests and review. See [`docs/threat-model/README.md`](docs/threat-model/README.md) for the complete threat model applied across the codebase. -Request shape: +## Failure Modes -```json -{ - "message": { - "nonce": 0, - "max_fee": 1, - "data": "0x..." - }, - "signature": "0x...", - "sender": "0x..." -} -``` - -Notes: +The sequencer is designed to handle: -- `signature` must be 65 bytes. -- `sender` is required and must match the recovered signer. -- `message.data` is SSZ-encoded method payload bytes. -- payload size is bounded at ingress; oversized requests are rejected before entering the hot path. -- overload is enforced at queue admission: if the inclusion-lane queue is full, `POST /tx` returns HTTP `429` with code `OVERLOADED` and message `queue full`. -- queue capacity is an internal runtime constant tuned alongside inclusion-lane chunking to absorb short bursts; if this starts triggering persistently, it is a signal to revisit runtime sizing or throughput rather than add another admission layer. +- **L1 provider outages** — workers retry with exponential backoff. The inclusion lane and API continue operating locally. A wall-clock fallback detects when an outage pushes batches into the danger zone. +- **Process crashes** — recovery runs at startup. All recovery state is derived from SQLite (atomic transactions) and L1 safe state. No external coordination needed. +- **Extended downtime** — on restart, the sequencer syncs to the current L1 safe head, flushes if needed, and recovers. +- **Adversarial L1 mempool** — block builders and private mempools are treated as adversarial. The recovery flusher consumes every pending nonce slot with a no-op so delayed "zombie" submissions cannot land later. -### `GET /ws/subscribe?from_offset=` +## Interfaces -WebSocket stream of sequenced L2 transactions from persisted order. +### User Operations -Notes: +Users submit signed operations via `POST /tx` (JSON). Operations are signed with EIP-712 using the rollup's chain ID and app address. The sequencer validates the signature, executes the operation against the current app state, and returns a soft confirmation. -- `from_offset` is optional and defaults to `0`. -- messages are JSON text frames. -- binary fields are hex-encoded (`0x`-prefixed). -- the current runtime enforces a subscriber cap of `64` and a catch-up cap of `50000` events. -- if the requested catch-up window exceeds that cap, the server upgrades and then immediately closes the socket with close code `1008` (`POLICY`) and reason `catch-up window exceeded`. +### Sequenced Transaction Feed -Message shapes: +Subscribers connect via `GET /ws/subscribe?from_offset=` (WebSocket). The feed delivers all sequenced transactions (user ops + direct inputs) in deterministic order, matching the on-chain execution order. This is the primary interface for downstream consumers (frontends, indexers). The endpoint is designed for a small number of indexer subscribers, which serve users directly. -```json -{ "kind": "user_op", "offset": 10, "sender": "0x...", "fee": 1, "data": "0x..." } -``` +### Batch Submission -```json -{ "kind": "direct_input", "offset": 11, "payload": "0x..." } -``` +The batch submitter posts closed batches to L1's InputBox contract. Each batch carries a sequential nonce for deduplication; L1 wallet nonces guarantee ordering. The submitter is stateless — it derives pending work from SQLite and L1 state each tick. -Success response: +## Running -```json -{ - "ok": true, - "sender": "0x...", - "nonce": 0 -} +```bash +SEQ_ETH_RPC_URL=http://127.0.0.1:8545 \ +SEQ_CHAIN_ID=31337 \ +SEQ_APP_ADDRESS=0x1111111111111111111111111111111111111111 \ +SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac09...f2ff80 \ +cargo run -p sequencer ``` -## Storage Model - -- `batches`: batch metadata -- `frames`: frame boundaries within each batch -- `frames.fee`: committed fee for each frame -- `user_ops`: included user operations -- `sequenced_l2_txs`: append-only ordered replay rows (`UserOp` xor `DirectInput`); inserting into `user_ops` also appends the corresponding replay row via trigger `trg_sequence_user_op` -- `safe_inputs`: direct-input payload stream -- `batch_policy`: singleton knobs and constants for DA-style batch sizing and fee derivation; `batch_policy_derived` view exposes `recommended_fee` and `batch_size_target` - -## Project Layout - -- `sequencer/src/main.rs`: thin binary entrypoint -- `sequencer/src/lib.rs`: public crate surface -- `sequencer/src/config.rs`: runtime input parsing and EIP-712 domain construction -- `sequencer/src/runtime.rs`: sequencer bootstrap and component wiring -- `sequencer/src/api/`: HTTP API and error mapping -- `sequencer/src/inclusion_lane/`: hot-path inclusion loop, chunk/frame/batch rotation, catch-up -- `sequencer/src/input_reader/`: safe-input ingestion from InputBox into SQLite -- `sequencer/src/l2_tx_feed/`: DB-backed ordered-L2Tx feed for WS subscriptions -- `sequencer/src/storage/`: schema, migrations, SQLite persistence, and replay reads -- `sequencer-core/src/`: shared domain types and interfaces (`Application`, `SignedUserOp`, `SequencedL2Tx`, feed message types) -- `examples/app-core/src/`: wallet prototype implementing `Application` -- `tests/benchmarks/`: benchmark harnesses and benchmark spec +Required: `SEQ_ETH_RPC_URL`, `SEQ_CHAIN_ID`, `SEQ_APP_ADDRESS`, `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` (or `_FILE`). -## Prototype Limits +Optional: `SEQ_HTTP_ADDR` (default `127.0.0.1:3000`), `SEQ_DATA_DIR` (default `sequencer-data`), `SEQ_PREEMPTIVE_MARGIN_BLOCKS` (default `300`), `SEQ_SECONDS_PER_BLOCK` (default `12`), `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH`. -- Wallet state is in-memory and not persisted. -- Schema and migrations are still in prototype mode and may change. - -## Local Test Prerequisites - -- Some `sequencer` tests spin up `Anvil`; install Foundry locally if you want the full test suite: -- Self-contained benchmarks also spawn `Anvil` from a preloaded rollups state dump. +## Development ```bash -foundryup +cargo check # compile +cargo test --workspace --exclude canonical-test # test (canonical-test needs libslirp) +cargo fmt --all # format +cargo clippy --all-targets --all-features -- -D warnings # lint ``` -- Prepare local benchmark + guest build dependencies: - -```bash -just setup -``` +Some tests require [Foundry](https://getfoundry.sh) (`anvil` on PATH). They run by default and fail with a clear message if unavailable. This project uses Nix + direnv for tooling — `direnv allow` provides Foundry, TLA+, and other dependencies. -- Enable the Anvil-backed reader tests explicitly: +## Further Reading -```bash -RUN_ANVIL_TESTS=1 cargo test -p sequencer --lib -``` +- [`AGENTS.md`](AGENTS.md) — developer guide: architecture, conventions, duality, recovery, invariants, rules. +- [`CLAUDE.md`](CLAUDE.md) — quick reference for shell setup and commands. +- [`docs/threat-model/README.md`](docs/threat-model/README.md) — trust boundaries, in-scope and out-of-scope threats. +- [`docs/recovery/README.md`](docs/recovery/README.md) — recovery design, TLA+ formal verification, design history. +- [`sequencer-core/`](sequencer-core/) — shared domain types (`Application`, `SignedUserOp`, `Batch`, `Frame`). +- [`examples/app-core/`](examples/app-core/) — placeholder wallet app implementing the `Application` trait. ## License -Apache-2.0. See `LICENSE`. - -Authors are listed in `AUTHORS`. +Apache-2.0. See [`LICENSE`](LICENSE). Authors in [`AUTHORS`](AUTHORS). diff --git a/TODO.md b/TODO.md deleted file mode 100644 index f9e02e5..0000000 --- a/TODO.md +++ /dev/null @@ -1,83 +0,0 @@ -# TODO - -## North Star - -Build a robust sequencer prototype for a future DeFi stack, with deterministic ordering, low-latency acks, and strong replay/canonical alignment. - ---- - -## Done - -### Sequencer Foundation - -- Thin binary entrypoint plus library runtime (`sequencer::run`, `RunConfig`). -- Simplified runtime/config surface with explicit EIP-712 deployment inputs. -- Hardened write path: API -> inclusion lane -> app execution -> persistence -> ack. -- `L2Tx` broadcaster with WebSocket fanout of ordered `L2Tx`s. -- Bounded WebSocket catch-up window plus subscriber guardrails. -- Shared shutdown supervision across API, inclusion lane, and broadcaster. -- Paged replay/catch-up in inclusion lane and broadcaster to avoid unbounded startup memory growth. -- Persisted `safe_block` frontier model for frames, with leading direct inputs materialized when opening a new frame. - -### Benchmarks & Tooling - -- Benchmark harnesses in `tests/benchmarks/` for ack latency, round-trip latency, sweeps, and unit hot path. -- Baseline reporting for p50 / p95 / p99, throughput, and RSS trends. -- Same-host benchmark workflows and docs aligned with the current runtime/config model. - ---- - -## MVP Scope (Remaining) - -### 1) Sequencer Core - -- Implement direct-input reader from blockchain (ingests into `safe_inputs`). -- Implement batch submitter (reads closed batches and submits on-chain). -- Implement inclusion fee estimator module that updates the suggested fee in DB (`batch_policy`, e.g. `gas_price` or related knobs). -- Add paginated historical `L2Tx` sync endpoint so lagging readers can backfill over HTTP before switching to `/ws/subscribe` for live updates. -- Keep storage/replay semantics deterministic and catch-up-safe as direct-input ingestion, batch submission, and recovery flows land. - -### 2) Recovery / Canonicality - -- Define how canonical progress is derived from persisted facts so replay stays deterministic. -- Detect when scheduler/canonical execution invalidates previously closed batches. -- Define the recovery procedure when persisted batches are invalidated: - - fail fast if the persisted state is inconsistent with canonical inputs - - rebuild or flush invalidated batches before resuming normal service - - notify readers when batches are invalidated - - notify readers when batches become final on-chain - -### 3) Canonical App / Scheduler - -- Implement scheduler behavior in `examples/canonical-app` using shared `sequencer-core` + `examples/app-core`. -- Ensure deterministic ordering model compatible with persisted sequencer order. -- Keep the canonical app as the state-transition artifact used by verification flow (Cartesi Machine / RISC-V path), not by sequencer runtime itself. -- Add focused tests for queue/drain/backstop behavior and ordering invariants. - -### 4) Benchmarks & Evaluation - -- Add canonical network-aware benchmark runs (client/server on different hosts or with injected latency/jitter). -- Turn target evaluation into a real pass/fail mode for the canonical network profile, not just same-host comparison. -- Tune queue / broadcaster / buffer sizing from benchmark evidence instead of ad hoc guesses. -- Revisit inclusion-lane adaptive chunk sizing only after the baseline latency/throughput envelopes are stable. - -### 5) Client / API Ergonomics - -- Add API endpoint to query current suggested inclusion fee. -- Decide whether wallet-specific convenience endpoints belong in the sequencer or in the application/client layer: - - current nonce / tx count - - EIP-712 domain discovery -- If those helper endpoints stay in the sequencer, implement them with a clear separation between core sequencer state and wallet-app-specific state. - ---- - -## Post-MVP (Nice to Have / Dogfooding Artifacts) - -- `sdk/ts-client/`: TypeScript client library for browser/server JS callers. -- `sdk/cli/`: Rust CLI for manual tx submission and debugging flows. -- `examples/web-demo/`: browser demo app consuming `sdk/ts-client`. - -Notes: - -- These are intentionally outside MVP scope. -- Still valuable for dogfooding and contributor onboarding. diff --git a/docs/recovery/README.md b/docs/recovery/README.md new file mode 100644 index 0000000..c5bf7cf --- /dev/null +++ b/docs/recovery/README.md @@ -0,0 +1,384 @@ +# Batch Recovery + +This document describes the recovery design for the sequencer: how the system detects that batches are failing to land on L1, and how it recovers to a consistent state. The design is verified with bounded TLA+ model checking ([`preemptive.tla`](preemptive.tla)). + +See `AGENTS.md` "Batch Staleness and Recovery" for quick-reference tables and function names. + +## Runtime lifecycle at a glance + +The sequencer's recovery loop spans two process lifetimes: + +1. **In-process detection.** The `DangerDetector` polls `Storage::check_danger` on a cadence. When any non-`Safe` status fires (`L1ViewStale`, `ClosedBatchInDanger`, `TipInDanger`, or `EstimatedBatchInDanger`), the runtime converts that into `DangerDetectorExit::DangerDetected` under `RunError::Worker` and the process exits with non-zero status. +2. **External respawn.** An orchestrator (systemd, k8s, …) restarts the process. +3. **Startup dispatch.** The fresh boot runs `run_preemptive_recovery` before any writers come online: sync L1, re-run `check_danger`, then `decide_startup_action` routes to one of `Proceed`, `RecoverTip`, `FlushAndCascade`, or `Refuse`. Recovery actions run their DB mutations as single SQLite transactions; `Proceed` intentionally does no DB writes. + +The detector trip and the startup dispatch share the same `check_danger` function; the detector cares only that *some* arm fired, while the startup dispatch examines *which* arm fired to pick the right action. + +Key abstractions, by responsibility: + +- **`DangerDetector`** ([`recovery/detector.rs`](../../sequencer/src/recovery/detector.rs)): tiny background task that calls `Storage::check_danger` on a cadence. Never writes to the DB, never talks to L1. Exits with `DetectorExit::RecoveryRequired` when any non-`Safe` status fires. The runtime converts that into a `DangerDetectorExit::DangerDetected` worker exit and the process exits. The dispatch difference between statuses only matters at the next startup, where `decide_startup_action` re-runs `check_danger` and routes accordingly. +- **`BatchSubmitter`** ([`l1/submitter/worker.rs`](../../sequencer/src/l1/submitter/worker.rs)): makes L1 progress only — never checks danger. Productive ticks re-enter immediately; idle/transient ticks sleep `idle_poll_interval`. A pure `decide_submit_start` function folds observed L1 nonces over the scheduler-accepted frontier. +- **`decide_startup_action`** ([`recovery/mod.rs`](../../sequencer/src/recovery/mod.rs)): pure function. Takes `danger` and returns `Proceed | RecoverTip { batch_index } | FlushAndCascade { batch_index } | Refuse(reason)`. L1 reachability is an execution concern: if the flush path cannot reach L1, startup fails and the orchestrator retries. +- **`MempoolFlusher`** ([`recovery/flusher.rs`](../../sequencer/src/recovery/flusher.rs)): submits no-op transactions to consume all pending wallet-nonce slots and waits for safe finality. Does **not** retry internally on provider errors — the orchestrator's respawn loop is the retry mechanism. +- **`ProtocolTiming`** ([`sequencer-core/src/protocol.rs`](../../sequencer-core/src/protocol.rs)): single source of truth for scheduler timing (`max_wait_blocks`) plus the sequencer-local tuning knobs (`preemptive_margin_blocks`, `l1_read_stale_after_blocks`, `seconds_per_block`). The batch-submitter address is deployment identity and is passed separately to `scheduler_accepts`. + +All five pieces are replaceable at the abstraction boundary: the tick decision is a pure function; the storage surface returns structs, not ad-hoc tuples; the danger detector and submitter are independently testable. + +## The Batch Tree + +Batches form a tree where each node is a batch and edges point from child to parent. Each batch has a single parent: the preceding batch in the valid chain. + +Batches have two identifiers: + +- **Index** (`batch_index`): monotonically increasing, unique, never reused. Creation order. +- **Nonce** (`batch_nonce`): depth of the node in the tree. Assigned by the batch submitter to valid closed batches. + +In normal operation the tree degenerates into a list -- index and nonce increase in lockstep. Branches appear only after recovery, when a suffix of the chain is invalidated and a new batch forks from the last valid ancestor. + +There is always exactly one **valid path** (root to leaf) that constitutes the current batch chain. The valid path splits into a **prefix** (safe on L1, accepted by the scheduler) and a **suffix** (pending or confirming). + +### Genesis sentinel (nonce-0 edge case) + +Recovery requires at least one Gold ancestor (the cascade invalidates a suffix and forks from the last Gold batch). If the very first batch (nonce 0) goes stale before any batch becomes Gold, there is no ancestor to fork from. + +The TLA+ model handles this with a **genesis sentinel**: the initial state starts with a Gold batch at nonce 0. This is a modeling technique that eliminates the nonce-0 special case, allowing Resolve to use uniform logic (the `fng > 1` guard is always satisfied). Without it, the model would need a separate Resolve action with different arithmetic for the "no Gold ancestor" case. + +The implementation can handle the nonce-0 case either by submitting a sentinel batch at first startup, or by special-casing the recovery code for the "no Gold ancestor" branch. + +## Coloring + +Every batch on the valid path has exactly one color. Dead branches are lead (permanently invalid). + +### Simplified model (three colors) + +| Color | Meaning | Terminal? | +|------------|----------------------------------------------------------------|-----------| +| **Gold** | Safe on L1 and accepted by the scheduler | Yes | +| **Silver** | Valid, optimistically executed, but not yet safe/accepted | No | +| **Lead** | Invalid (has `batches.invalidated_at_ms` set) | Yes | + +Gold batches form a contiguous prefix of the valid path. Silver batches form a contiguous suffix (after the gold prefix up to the open batch). Lead batches hang off gold nodes as dead branches -- the first lead in any cascade always has a gold parent. + +### Extended model (five colors) + +To model the full lifecycle including L1 submission: + +| Color | Meaning | Has `w_nonce`? | +|-------------|--------------------------------------------------------|----------------| +| **Tip** | Open batch, not yet closed | No | +| **Pending** | Closed, may or may not be submitted to mempool | Maybe | +| **Bronze** | Included in an L1 block, block not yet safe | Yes | +| **Silver** | Included, block has reached safe finality | Yes | +| **Gold** | Safe, accepted and executed by the scheduler | Yes | + +The spine ordering invariant: `Gold* Silver* Bronze* Pending* Tip` + +A Pending batch may have a `w_nonce` (submitted to the L1 mempool but not yet included in a block) or not (not yet submitted). The batch submitter assigns `w_nonce`s to all unsubmitted Pending batches at once, in spine-position order. + +## Nonce Poisoning + +The scheduler maintains a single counter: "I expect batch nonce N next." + +When a batch with nonce N arrives stale, the scheduler **skips it entirely** -- no nonce increment, no state change, no report. It is a true noop in nonce-space. + +This poisons the nonce counter. Every subsequent batch (nonce N+1, N+2, ...) is dead on arrival. Not because they are individually stale, but because the scheduler still expects nonce N. The only batch with nonce N was stale and skipped, so the counter will never advance past N. + +Cascade invalidation is therefore **exact, not conservative**. The sequencer's `WHERE batch_index >= stale_batch_index` mirrors precisely what the scheduler will do (refuse). The entire silver suffix is unreachable once any batch in it is stale. + +Recovery is the only way forward: create a new batch with nonce N, giving the scheduler what it needs to resume. + +## Two Staleness References + +The staleness formula is `reference_block - first_frame_safe_block >= MAX_WAIT_BLOCKS`, but the reference block differs by context: + +### Inclusion staleness (scheduler's perspective) + +``` +inclusion_block - first_frame_safe_block >= MAX_WAIT_BLOCKS +``` + +Used by `populate_safe_accepted_batches` to simulate what the scheduler accepts. Each batch has its own inclusion block (the L1 block where its submission landed). **Not monotonic** across batches -- a promptly submitted old batch can be healthy while a late-submitted newer batch is stale. + +Inclusion staleness determines the **gold frontier**: the set of batches the scheduler has accepted. + +### Current staleness (sequencer's detection) + +``` +current_safe_block - first_frame_safe_block >= MAX_WAIT_BLOCKS +``` + +Used by the danger threshold detector. The reference block (`current_safe_block`) is the same for all batches. **Monotonic within the valid path** -- earlier batches have smaller `first_frame_safe_block`, so larger difference. If the frontier batch is not stale by this measure, no batch is. + +Current staleness triggers **preemptive recovery** (see below). + +## Nonce Uniqueness on the Valid Path + +`batches.nonce` can repeat across the full table -- a recovery batch inherits `parent.nonce + 1` from the last valid ancestor, which is the same nonce the first invalidated suffix batch had. Among **valid batches** (those with `invalidated_at_ms IS NULL`), nonces are unique because the valid path is a strict chain via `parent_batch_index`. + +This matters because L1 works in nonce-space (the scheduler identifies batches by nonce) while the sequencer works in index-space (local `batch_index`). The recovery path needs to translate between them: "which batch indexes should we invalidate?" Nonce uniqueness on the valid path is what makes this mapping unambiguous. + +## The L1 Stream + +L1 processes transactions in `w_nonce` order. At each slot (a given `w_nonce` value), exactly one transaction is included. If multiple transactions compete for the same slot (e.g., a dead batch and a flush no-op), L1 non-deterministically picks one. The loser is discarded. + +This is the interface between the sequencer and the scheduler. The scheduler sees a stream of entries ordered by `w_nonce`, each with a `batch_nonce`, `inclusion_block`, and `safe_block`. It processes them in order, accepting or rejecting based on nonce match and staleness. + +## The Uncertainty Interval + +The core insight behind the recovery design is that **mempool uncertainty is bounded by a time interval**. + +Once a batch's `safe_block` is old enough that `current_safe_block - safe_block >= MAX_WAIT_BLOCKS`, we know it is stale no matter when it lands on L1 (because `inclusion_block >= current_safe_block`). Any batch in the mempool with that `safe_block` is dead-on-arrival. This means mempool uncertainty has a natural expiration: after `MAX_WAIT_BLOCKS`, the L1 outcome doesn't matter. + +This gives us three regimes: + +``` +|---------- safe ----------|-- danger zone --|-- past MAX_WAIT --| + no action flush + recover self-resolved +``` + +- **Before the danger zone**: batches are young. Nothing to do. +- **In the danger zone**: batches might land stale, or might still make it. This is the window of uncertainty. For **closed unresolved batches**, the flush resolves it by forcing every `w_nonce` slot to finalize (batch wins or no-op wins). After the flush, the sequencer reads the scheduler's finalized state and cascades if needed. An **open Tip** has no `w_nonce` slot yet, so it is not part of this uncertainty set. +- **Past MAX_WAIT**: all unresolved batches are guaranteed stale by L1 monotonicity (`inclusion_block >= current_safe_block >= safe_block + MAX_WAIT`). For closed unresolved batches, the L1 outcome no longer matters because every eventual inclusion is stale, but wallet-nonce slots may still need to be flushed (or naturally consumed) before recovery can reconstruct the scheduler frontier. For an aging open Tip, there is no L1-slot uncertainty at all, so startup recovery can invalidate it directly. + +**What TLA+ proves vs external reasoning**: the TLA+ model ([`preemptive.tla`](preemptive.tla)) proves that after all `w_nonce` slots are resolved (however that happens), ZombieSafety holds. It does not model the danger threshold or the passage of time. The claim that "past MAX_WAIT, staleness self-resolves" is an external argument from L1 monotonicity (`inclusion_block >= current_safe_block`), not something TLA+ checks. + +Any recovery design must wait out this uncertainty. The question is how. The preemptive design (implemented here) forces resolution by going offline and flushing. An alternative optimistic design lets the uncertainty resolve naturally but keeps serving soft confirmations -- see [`history/`](history/) for that approach and why we preferred preemptive. + +## Silver-Only for Submitted Batches + +The Silver-only constraint applies to **submitted batches whose L1 slot outcome is still relevant**. This is the zombie path, and it is where the optimistic-design counterexample from [`history/`](history/) still matters. + +A Silver batch's L1 entry is permanent -- no mempool competition can kill it. The scheduler **will** see it, at a `w_nonce` lower than any recovery batch, and be poisoned. This ordering guarantee is what makes nonce poisoning reliable. + +Detecting staleness on Pending or Bronze submitted batches *before wallet-nonce uncertainty is resolved* is unsafe: a recovery batch can take the frontier's L1 slot via wallet-nonce mutual exclusion, preventing the scheduler from ever seeing the stale frontier, and allowing non-frontier dead batches to pass the nonce check. TLA+ model checking found this bug; see [`history/`](history/) for the counterexample. + +The open Tip is different. It has no L1 transaction yet, so there is no `w_nonce` competition and no zombie risk. Once `current_safe_block - first_frame_safe_block >= danger_threshold`, startup recovery can invalidate the aging Tip directly and open a fresh one. Likewise, after a preemptive flush has resolved all competing `w_nonce` slots for closed batches, the atomic recovery transaction can safely use **current staleness** on the oldest unresolved batch (closed or open). + +## Preemptive Recovery Design + +The sequencer uses a preemptive approach: detect danger early, go offline, flush the mempool, then recover on solid ground. This design was preferred over the optimistic alternative because it is simpler to reason about and produces fewer invalidated soft confirmations (the sequencer stops issuing them before the cascade). + +### Step 1: Danger threshold + +Define `DANGER_THRESHOLD = MAX_WAIT_BLOCKS - MARGIN`. When the frontier batch's current staleness (`current_safe_block - safe_block`) reaches `DANGER_THRESHOLD`, **trigger preemptive recovery**. + +The threshold is *only* a trigger. It says "stop running, hand off to recovery." It does **not** say "this batch is doomed." The cascade decision belongs to step 5, which examines the post-flush state and acts on what's actually there. + +#### Why a margin at all (Sorites argument) + +The right value of `MARGIN` is not derived from the recovery procedure's runtime — it falls out of a sharper question: **at what age do we give up on the current batches and start anew?** + +Two endpoints are clear: + +- A batch that's 1 minute behind shouldn't be invalidated. The infra hiccup might pass; pre-confirmations issued against it will likely still land. +- A batch that's 1 minute *before* `MAX_WAIT_BLOCKS` shouldn't be left to die. We've already tried for hours. The last minute won't save us, and pre-confirmations issued in this window are knowingly dishonest — we have strong evidence they won't land. + +Somewhere between those, we want to switch from "keep waiting" to "give up." The exact crossover is a Sorites question with no canonical answer, but two design pressures pin it: + +1. **Stop issuing pre-confirmations on state we reasonably know won't land.** As current staleness approaches `MAX_WAIT_BLOCKS`, the probability that the current batch lands gracefully drops. Pre-confs issued past that point are increasingly dishonest to users. +2. **Give the operator runway to fix infra.** If L1 is misbehaving, network is degraded, mempool is congested — the operator needs hours, not minutes, to diagnose and act before the system commits to recovery and invalidates work. + +The recovery procedure's own runtime (flush submission + L1 safe finality wait of ~13 min on Ethereum + atomic SQLite cascade) is a *floor* on `MARGIN`, not the deciding factor. It must fit, but fitting it is far from the operating point. + +#### Defaults + +With `MAX_WAIT_BLOCKS = 1200` (~4 hours), the default `MARGIN = 300` blocks (~1 hour at 12s/block) gives the operator ~1 hour after danger-zone entry before the system commits to recovery. That's well above the procedure-runtime floor (~15 min) and meaningful runway under the second design pressure. + +Production tunings with a longer `MAX_WAIT_BLOCKS` (e.g. 24h) should keep the margin in the hours range — there's no benefit to a tighter margin once `MARGIN` exceeds the procedure-runtime floor several times over. + +### Step 2: Go offline + +Stop accepting new user operations. From the outside world, the sequencer is temporarily unavailable. This eliminates concurrent batch creation during recovery. + +### Step 3: Flush mempool + +Query the latest confirmed `w_nonce` (N) and the pending `w_nonce` (M). Submit `M - N` no-op transactions (e.g., self-transfer of 0 ETH) at nonces N, N+1, ..., M-1. These compete with any batches in the mempool at the same slots. + +Wait for all `M - N` slots to reach L1 safe finality. + +### Step 4: Post-flush state + +Every `w_nonce` slot from N to M-1 is now resolved: + +- **Batch won**: the batch is on L1 and safe (Silver or Gold) +- **No-op won**: the batch is dead forever, its slot consumed + +There are no more mempool entries. All uncertainty is resolved. + +**Flush safety does not depend on eviction.** A no-op may fail to evict a still-pending batch tx (e.g. our local node rejects the replacement under EIP-1559's ≥10% bump rule). That's fine: the outer `flush_and_wait` loop is unbounded — it keeps running until `pending ≤ safe`, and *eventual* inclusion of either the original batch tx or the no-op resolves the slot. Safety holds regardless of which lands; eviction is only an operational efficiency concern. + +### Step 5: Run recovery + +This is an atomic SQLite transaction operating on the best available L1 state. The storage work splits cleanly by whether a flush ran first. + +#### Mental model: "everything past gold is doomed" + +After the flush has resolved every wallet-nonce slot, and `populate_safe_accepted_batches` has been re-synced, the gold spine is at its **maximum extent**: the simulation walked safe-inputs in inclusion order, accepting each one until it hit a barrier (a stale batch, or a missing batch where a no-op consumed the slot). + +Any batch past that gold frontier is **doomed**, in one of three concrete senses: + +| State | What happened | Why doomed | +|---|---|---| +| **Silver-stale** | Original tx landed, scheduler skipped (`inclusion_block - first_frame ≥ MAX_WAIT`) | Scheduler's expected nonce never advances past it; downstream batches are nonce-poisoned | +| **Silver-fresh poisoned** | Original tx landed fresh, but a preceding stale or missing batch poisoned the nonce | Scheduler skipped on nonce mismatch; on-chain row can't be retroactively re-evaluated | +| **Pending (no-op'd)** | Flush no-op consumed the wallet-nonce slot; original tx never landed | The L1 transaction is dead. Re-submission at a fresh slot would land *after* the existing on-chain Silver-poisoned batches; the scheduler sees those at lower `safe_input_index`, advances expected past them on the resub generation, but the per-original-tx work is gone | + +**Why isn't this just "stale"?** Under self-trust (we don't defend against malformed self-submissions), the *first* non-gold closed batch can only be Silver-stale or Pending. Nonce-mismatch is impossible at the frontier — nonces are contiguous on the valid path (`trg_enforce_nonce_contiguity`). But *downstream* batches past that first non-gold are typically Silver-fresh-poisoned: their inclusion-staleness was fine, but they were processed when expected was stuck at the poisoned nonce. + +Cascading from the first non-gold catches all three. **No per-batch age check is needed for the cascade pivot itself** — every closed batch past gold is doomed by construction. + +#### Path A — `recover_post_flush(danger_threshold)` (called from FlushAndCascade) + +After step 3 (flush) and step 4 (re-sync), the gold frontier is fresh. Run the atomic recovery transaction: + +1. **Find the cascade pivot.** First try the closed pivot: first valid closed batch with `nonce >= frontier_nonce`. By the contiguity invariant, this batch's nonce is exactly `frontier_nonce`. If one exists, cascade from it. +2. **No closed pivot? Check the Tip.** When all closed batches landed fresh and were accepted (the "everything worked" aftermath), there's no closed pivot — but the Tip can still be in the danger zone. When the lane rotates without a safe-block advance between frames (e.g. immediately after init, both frames share the bootstrap `safe_block`), `S_tip = S_closed`. The closed batch can become gold by inclusion-staleness while the Tip's age — measured against `current_safe_block` after the flush wait — has crossed the danger zone. Pure monotonicity (`S_tip ≥ S_closed`) doesn't rule this out: equality is allowed. So fall through to `find_tip_batch_in_danger(danger_threshold)`. If the Tip's age clears `danger_threshold`, cascade it. +3. **Cascade-invalidate the suffix**: set `invalidated_at_ms` on every valid batch with `batch_index >= pivot.batch_index`. This catches all non-gold batches in cases (2)/(3) above, and the Tip alone in the no-pivot-but-Tip-aging case. +4. **Open recovery batch**: parent is the last valid ancestor (`MAX(batch_index) FROM valid_batches` after the cascade). Nonce is structurally `parent.nonce + 1`, which equals `frontier_nonce` — the scheduler's `expected_nonce`. Re-drain direct inputs from the invalidated batches via the `MAX(safe_input_index) + 1` query over `valid_sequenced_l2_txs`. + +**Threshold = `danger_threshold`, not `MAX_WAIT_BLOCKS`**. We're already committed to recovery; the Tip is past gold; if it's also past the threshold that would have triggered recovery had it been a closed batch, cascade it. Otherwise the next danger detector tick after resume would re-trip on the Tip's eventual close + submission anyway (the closed batch would inherit its first frame's safe_block). + +#### Path B — `recover_aging_tip(danger_threshold)` (called from RecoverTip) + +The `RecoverTip` action is dispatched when `check_danger` returns `TipInDanger(idx)`: no closed batch is past the gold frontier in the danger zone, but the open Tip's first frame has aged past `danger_threshold`. **No flush ran** — the Tip has no L1 footprint, so there's nothing to flush. + +Closed batches past gold (if any) are still in their natural lifecycle — pending in the mempool, recently included, awaiting safe finality. Cascading them would prematurely abort their progression. We act only on the Tip: + +1. Run `find_tip_batch_in_danger(danger_threshold)`. If `Some(tip_index)`, cascade-invalidate from there (which only touches the Tip — no closed batches have `batch_index >= tip_index`). +2. Open a fresh recovery batch. +3. If no Tip in danger and no Tip exists at all (torn-state crash recovery), open a Tip anyway. + +The `Proceed` path does not call this function. Under that dispatch, no danger arm fired and the persisted state is left untouched; genesis Tip creation is handled by the inclusion lane's normal `initialize_open_state` path. + +#### Why `danger_threshold`, not `MAX_WAIT_BLOCKS`, for the Tip threshold + +The Tip threshold is a **policy choice**, not a mathematical staleness bound. A Tip whose first frame is at age `danger_threshold` could in principle still close, submit, and land fresh by inclusion-staleness — `inclusion_block - first_frame` would be roughly `danger_threshold + (rotation + submit latency)`, which (with a reasonable margin) is still below `MAX_WAIT_BLOCKS`. + +We invalidate at `danger_threshold` because: + +1. **Pre-confirmation honesty.** Once the Tip's age crosses the danger zone, the system has decided this generation is operationally suspect. Continuing to issue soft confirmations against it is dishonest to users. +2. **Avoid retrip risk.** The runtime danger detector also fires on `DangerStatus::TipInDanger`. Without invalidating at startup, we'd resume operation, the detector would re-trip on the next tick, and we'd cycle. Cascading at startup converges in one cycle. +3. **Symmetry with the closed-batch trigger.** The closed-batch detector trips at `danger_threshold`. Using the same threshold for the Tip preserves the framing: "danger zone = committed to recovery." + +### Step 6: Resume + +Restart the batch submitter and user-op acceptance. The sequencer is back online. + +### Why post-flush cascade is unconditional (and not threshold-based) + +An earlier design considered using `MAX_WAIT_BLOCKS` as the cascade threshold even in the post-flush path: only invalidate the frontier if its `current_safe_block - first_frame.safe_block ≥ MAX_WAIT`. The intuition was to preserve soft confirmations when re-submission could still land fresh. + +**This doesn't hold up.** Walk through the boundary case: + +1. Frontier batch has `current_staleness ∈ [danger_threshold, MAX_WAIT)`. Detector trips, flush runs. +2. `recover_post_flush` (with hypothetical threshold) sees age below MAX_WAIT, declines to cascade. Resume. +3. Submitter wakes up, resubmits the Pending frontier (and any non-gold closed batches) at fresh wallet-nonce slots. They enter the mempool. +4. Detector polls again. Frontier age has barely moved (or not at all — safe head advances at ~1 block per 12s); still above `danger_threshold`. Detector trips again. +5. Recovery 2 starts. Flush submits no-ops at the slots the submitter just used for resubs. Bumped fees on no-ops typically out-bid resubs. Resubs killed. +6. Goto step 2. Loop converges only when `current_staleness` finally crosses `MAX_WAIT_BLOCKS` and the threshold check fires. + +Each loop iteration burns gas (no-ops + doomed resubs), takes ~12 minutes (the flush's safe-finality wait), and the soft confirmations are rolled back at the end anyway. Cascading on first non-gold converges in **one cycle** with predictable cost. + +### Startup behavior summary + +The startup flow is dispatched by `decide_startup_action(danger)`: + +| `check_danger` result | Action | Recovery primitive | Why this dispatch | +|---|---|---|---| +| `Safe` | `Proceed` | none | Nothing crossed danger; leave persisted state alone and let the inclusion lane initialize genesis if the DB is fresh. | +| `L1ViewStale` | `Refuse(L1ViewStale)` | — | The L1 view is too old to support honest recovery or new soft confirmations. | +| `TipInDanger(N)` | `RecoverTip { N }` | `recover_aging_tip(danger_threshold)` (no flush — Tip has no L1 slot) | Tip has no L1 footprint; cascade and reopen directly. | +| `ClosedBatchInDanger(N)` | `FlushAndCascade { N }` | flush + `recover_post_flush(danger_threshold)` | Closed batch has L1 transactions whose fate must be resolved before cascading. | +| `EstimatedBatchInDanger(N)` | `Refuse(EstimatedBatchInDanger { N })` | — | Observed safe-state did not cross danger; only batch-relative wall-clock extrapolation did, and we don't recover from estimated state. | + +**L1 view freshness gates recovery.** `check_danger` first checks the L1 safe block timestamp against `l1_read_stale_after_blocks`. If the timestamp is missing or too old, startup refuses: the sequencer has no trustworthy L1 view from which to recover or issue new soft confirmations. With a fresh L1 view, observed-safe checks decide concrete recovery: `ClosedBatchInDanger` runs flush + cascade, while `TipInDanger` invalidates the open Tip directly. `EstimatedBatchInDanger` is the final batch-relative wall-clock fallback: observed safe-state has not crossed the threshold, but elapsed time since the last safe-head advance says the batch consumed its remaining runway, so startup refuses instead of recovering from estimated state. + +The Refuse variants block boot and surface to the operator. `Proceed` performs no recovery writes; the normal inclusion-lane startup path initializes the Tip if the DB is fresh. The mutating recovery actions each commit atomically: `RecoverTip` invalidates the aging Tip and opens a fresh one, while `FlushAndCascade` cascades the post-flush non-gold suffix and opens a fresh Tip when needed. + +**What TLA+ proves here**: the model still abstracts away the full startup cutover/flush decision. It proves ZombieSafety once wallet-nonce slots resolve, and separately models direct recovery of an aging open Tip. The claim that past `MAX_WAIT`, closed-batch staleness self-resolves is external reasoning from L1 monotonicity. The post-flush "cascade everything past gold" choice is also external reasoning (the "everything past gold is doomed" mental model above). + +### Startup observability + +Startup recovery logs the decision and outcome with stable structured fields: + +- `danger_status` — `safe`, `l1_view_stale`, `closed_batch_in_danger`, `tip_in_danger`, or `estimated_batch_in_danger`. +- `danger_batch_index` — set for batch-specific danger statuses. +- `startup_action` — `proceed`, `recover_tip`, `flush_and_cascade`, or `refuse`. +- `refuse_reason` — present on refusal. +- `l1_reachable`, `danger_threshold`, `max_wait_blocks`, and `l1_read_stale_after_blocks` on the decision log. +- `invalidated_count` on the completion log, plus `batches` when any batch was invalidated. + +The orchestrator should still be the source of restart-loop policy and alert routing, but it should not need to parse free-form messages to distinguish "refused because the L1 view is stale" from "recovering a Tip" or "running flush + cascade." + +### L1 view freshness + +The safety policy does not branch directly on a provider-reachability boolean. Reachability is an execution concern: startup tries to sync the safe head, and if `FlushAndCascade` later cannot reach L1, the flusher errors and the orchestrator retries. The decision primitive is the freshness of the L1 view recorded in SQLite. + +The most common real-world trigger for `L1ViewStale` is a stalled RPC gateway: the provider answers, but its safe-head response stops advancing (a degraded upstream node, a load-balancer routing to a lagging replica, or a temporary indexing pause). The sequencer can't distinguish "fresh answer from a stalled view" from "L1 itself is unhealthy" without a second source of truth, so it treats both the same way: refuse to commit to soft confirmations until the recorded safe block is fresh again. + +**At startup**: the sequencer attempts to sync the safe head from L1. Whether that succeeds or fails, it then checks the persisted safe block timestamp. If the timestamp is missing or older than `l1_read_stale_after_blocks * seconds_per_block`, `check_danger` returns `L1ViewStale` and startup refuses. If the view is fresh, observed-safe checks can route to recovery, and the batch-relative wall-clock estimate remains as a final refusal guard for unresolved batches whose safe-block age has effectively crossed the danger threshold. + +**At runtime**: the `DangerDetector` polls `Storage::check_danger` on its cadence. The input reader records both the observed safe block timestamp and the local time at which the safe head last advanced. If safe-head observations stop advancing, either the global safe block timestamp crosses the read-staleness threshold (`L1ViewStale`) or a specific unresolved batch crosses the batch-relative adjusted threshold (`EstimatedBatchInDanger`). The detector then exits with `RecoveryRequired`, the orchestrator respawns, and startup re-runs the same check. The batch submitter never observes danger; this responsibility lives entirely with the detector. + +**Other workers during L1 outages**: the inclusion lane and API are purely local (SQLite) and continue operating. The input reader retries L1 polling with error logging. All L1-dependent workers log errors at the `error` level to alert operators. + +The `seconds_per_block` parameter (default: 12 for Ethereum) is configurable via `SEQ_SECONDS_PER_BLOCK`. The L1 read-staleness threshold is configurable via `SEQ_L1_READ_STALE_AFTER_BLOCKS`; if unset, startup derives it before the write danger threshold. These estimates are conservative — they may cause earlier detection if blocks are slower than assumed. This is correct: better to crash early than to issue doomed soft confirmations. + +## Dead Batches + +After cascade invalidation, submitted Pending batches (those with `w_nonce` assigned) are **dead batches**. They are still in the L1 mempool, competing with their flush no-op transactions. + +Two outcomes per dead batch, non-deterministic: + +- **Dead batch beats no-op**: lands on L1, scheduler sees it, rejects it (stale by inclusion, or nonce-poisoned by a preceding stale/missing batch) +- **No-op beats dead batch**: dead batch killed forever, scheduler never sees it (the scheduler skips the gap) + +A killed batch acts as **silent nonce poison**: the scheduler never sees it, so `schedulerExpected` stays stuck at its `batch_nonce`. All subsequent batches have wrong nonces. + +Dead batches occupy `w_nonce` slots strictly below `walletNonce`. Recovery batches occupy `w_nonce` slots at or above `walletNonce`. **No overlap.** This is why no mutual exclusion is needed between dead batches and recovery batches -- they live in non-overlapping `w_nonce` ranges. + +## Implementation Constraints + +These constraints were discovered during TLA+ model checking and are required for correctness: + +1. **`walletNonce` must NOT be reset during recovery.** Recovery batches must use `w_nonces` strictly past all dead batch slots. The flush consumes dead batch slots by advancing `nextL1Slot` up to `walletNonce`. Recovery starts fresh from there. + +2. **`SubmitBatch` must use `max(walletNonce, nextL1Slot)`.** Prevents assigning `w_nonce` values for slots L1 has already consumed. + +3. **`SubmitBatch` must assign ALL pending batches at once, in spine-position order.** If batches are submitted individually, a flush-win can bump one batch's `w_nonce` past a later batch's, violating the spine ordering invariant. + +4. **Wall-clock freshness when the L1 view stops advancing.** The input reader records the L1 safe block timestamp and the local last-safe-head-progress time. `Storage::check_danger` first refuses on an old or missing safe block timestamp, then uses the local progress timestamp to estimate unresolved-batch age (`elapsed / seconds_per_block`). Without these checks, an L1 outage can silently push batches past the danger zone while the DB-based safe-block number remains frozen. + +5. **The accepted-frontier cache persists acceptances, not scan progress.** `safe_accepted_batches` stores the scheduler-accepted prefix and resumes from the latest accepted safe input. Rejected batch-submitter inputs after that frontier can be rescanned on later safe-head syncs until a later batch is accepted. This is a performance tradeoff, not a correctness bug: recovery batches can reuse a scheduler nonce after earlier rejected rows, so a separate persistent scan cursor would need careful nonce-reuse tests before being introduced. + +## Formal Verification + +The recovery design is verified with bounded TLA+ model checking. The canonical spec is [`preemptive.tla`](preemptive.tla). An alternative optimistic design is preserved in [`history/optimistic.tla`](history/optimistic.tla). + +**Scope and limitations**: these are bounded safety models. They exhaustively check all reachable states within the configured bounds, but do not prove liveness (eventual progress), do not model the danger threshold trigger or timing margins, and do not model crash/restart (the implementation relies on SQLite atomic transactions for crash safety). + +### `preemptive.tla` -- Slot-level safety under adversarial flush + +Models the core slot-level mechanics of preemptive recovery. At every `w_nonce` slot, L1 non-deterministically includes the spine batch OR a flush no-op (killing the batch). This covers the case where the frontier batch itself is killed during flush. The model also treats the open Tip's `safe_block` as meaningful, so it can explicitly recover an aging Tip that has no L1 footprint yet. + +The model is a **safety over-approximation**: it allows `AdvanceTip` and `SubmitBatch` to interleave freely with recovery, which the real protocol prevents (the sequencer goes offline). This makes the proof stronger -- if `ZombieSafety` holds under more interleavings, it holds under fewer. However, the model does not verify the full sequential protocol phases (cutover, flush, wait, recover, resume) described above; in particular, the startup decision of whether a closed unresolved batch must flush before recovery remains an external argument layered on top of the slot-level proof. + +**Verified**: 157M states, 0 violations. + +| Invariant | Meaning | +|-----------|---------| +| ZombieSafety | `schedulerExpected = CountGold(spine)` -- scheduler accepts exactly the Gold prefix | +| BatchNoncesContiguous | Batch nonces are 0..N-1 for non-Tip spine | +| InvalidOnlyOnGold | Dead branches only hang off Gold nodes | +| L1WNonceUnique | No two L1 entries share a `w_nonce` | +| L1BeforeCursor | All L1 entries have `w_nonce < nextL1Slot` | +| SchedulerBehindL1 | Scheduler cursor doesn't pass L1 cursor | +| DeadNotYetIncluded | Dead batches have `w_nonce >= nextL1Slot` | + +### Running the spec + +```bash +tlc -workers auto -deadlock docs/recovery/preemptive.tla # ~90s +``` + +Bounds are in `preemptive.cfg`. The `MaxWalletNonce` bound keeps the state space finite (kill/resubmit cycles generate new `w_nonce` values). Increase bounds for higher confidence at the cost of longer runtime. diff --git a/docs/recovery/history/README.md b/docs/recovery/history/README.md new file mode 100644 index 0000000..74e173e --- /dev/null +++ b/docs/recovery/history/README.md @@ -0,0 +1,56 @@ +# Recovery Design History + +This directory preserves the optimistic recovery design -- an alternative to the preemptive approach documented in the parent [`README.md`](../README.md). Both designs are sound. We preferred preemptive for its operational properties. + +## The Optimistic Design + +In the optimistic design, the sequencer keeps accepting user operations and building batches while recovery plays out in the background. If a batch goes stale, the system detects it when the batch becomes Silver (safe on L1), cascade-invalidates, and submits recovery batches -- all while the sequencer continues serving soft confirmations. + +The TLA+ spec [`optimistic.tla`](optimistic.tla) models this design with a scheduler, wallet nonces, zombie batches (invalidated batches still in the L1 mempool), and adversarial L1 inclusion. At each `w_nonce` slot where a zombie and a recovery batch compete, L1 non-deterministically picks one (wallet-nonce mutual exclusion). + +**Verified**: 194M states, 0 violations (after the Silver-only fix below). + +## The Silver-Only Constraint + +Both designs share a critical constraint: **recovery must wait for the frontier batch to be Silver before cascade-invalidating.** + +This constraint was discovered through the optimistic model. The original design allowed staleness detection on Pending or Bronze batches (a "short-circuit" for faster recovery). TLA+ found a counterexample: + +Three batches with `MAX_WAIT_BLOCKS = 2`: + +``` +batch bn=0 bn=1 bn=2 +sb 0 0 1 +wn 0 1 2 +``` + +With `currentSafeBlock = 2`, `bn=1` is stale by current block, `bn=2` is fresh. If we cascade from `bn=1`, both become zombies. Recovery creates a new `bn=1` at `wn=1`. + +At L1 slot 1, zombie `bn=1` and recovery `bn=1` compete (same `w_nonce`): + +- **Zombie wins**: scheduler sees it, stale, skip. Nonce poisoned. Safe. +- **Recovery wins**: zombie `bn=1` dies (never reaches L1). Recovery accepted. `schedulerExpected` advances to 2. Zombie `bn=2(wn=2)` is fresh (`inclusion_block - safe_block = 1 < 2`), matches expected nonce -> **accepted**. The scheduler executes invalidated batch data. + +The two protection layers (wallet-nonce mutual exclusion and nonce poisoning) undercut each other: mutual exclusion kills the batch that nonce poisoning needs. + +The fix: only detect staleness when the frontier is Silver (safe on L1, immutable). The scheduler is guaranteed to see it before any recovery batch. + +## Why We Chose Preemptive + +Both designs are sound once Silver-only detection is enforced. The difference is operational: + +**Both designs wait.** Any recovery design must wait for the frontier to become Silver before cascading. In the optimistic design, the sequencer keeps issuing soft confirmations during this wait -- confirmations that will be invalidated when the cascade fires. In the preemptive design, the sequencer goes offline before the cascade, so no doomed soft confirmations are issued. + +**Preemptive is simpler to reason about.** The optimistic design has concurrent actors: the batch submitter, the inclusion lane, L1 mempool competition, and recovery all interleave. The preemptive design is sequential: stop, flush, recover, resume. Each step has clear preconditions and postconditions. + +**Preemptive eliminates mempool races.** The flush resolves all `w_nonce` slot uncertainty before recovery runs. Recovery operates on fully-finalized L1 state. No zombie mutual exclusion needed. + +**The cost is downtime.** Preemptive recovery takes the sequencer offline for the duration of the flush + safe finality wait (~15-20 minutes on Ethereum). For a rare event (a batch approaching the 4-hour staleness deadline), this is acceptable. + +## Running the Spec + +```bash +tlc -workers auto -deadlock docs/recovery/history/optimistic.tla # ~3min +``` + +Bounds are in `optimistic.cfg`. diff --git a/docs/recovery/history/optimistic.cfg b/docs/recovery/history/optimistic.cfg new file mode 100644 index 0000000..1bb370c --- /dev/null +++ b/docs/recovery/history/optimistic.cfg @@ -0,0 +1,9 @@ +SPECIFICATION Spec + +CONSTANTS + MaxBatchIndex = 6 + MaxSafeBlock = 7 + MAX_WAIT_BLOCKS = 2 + +INVARIANTS + Inv diff --git a/docs/recovery/history/optimistic.tla b/docs/recovery/history/optimistic.tla new file mode 100644 index 0000000..8340ae5 --- /dev/null +++ b/docs/recovery/history/optimistic.tla @@ -0,0 +1,460 @@ +---------------------------- MODULE optimistic ----------------------------- +(* + * Formal model of sequencer batch tree with scheduler, wallet nonces, + * zombie batches, and adversarial L1 inclusion. + * + * Proves: ZombieSafety == schedulerExpected = CountGold(spine) + * + * After recovery, no zombie batch from an invalidated chain is ever + * accepted by the scheduler. + * + * Colors (spine ordering): Gold* Silver* Bronze* Pending* Tip + * - Tip: open batch (not yet closed) + * - Pending: closed, may have w_nonce (submitted to L1 mempool) + * - Bronze: included in an L1 block (not yet safe) + * - Silver: included in a safe L1 block + * - Gold: accepted by the scheduler + * + * Key mechanism — two-layer zombie protection: + * (1) Wallet nonce mutual exclusion: zombie and recovery batch compete + * for the same L1 slot. Loser's w_nonce is bumped. + * (2) Nonce poisoning: stale batch is a no-op in the scheduler (does + * not increment expected nonce), making all subsequent zombies + * have wrong batch_nonce. + * + * Actions: + * AdvanceTip -- close tip -> Pending, append new Tip + * SubmitBatch -- assign w_nonce to first unsubmitted Pending + * L1Include -- include tx at nextL1Slot (spine or zombie wins) + * AdvanceSafeBlock -- L1 safe block advances, Bronze -> Silver + * SchedulerStep -- scheduler processes next safe L1 entry + Gold + * Resolve -- detect staleness, cascade, create zombies + * + * See docs/recovery.md for the conceptual model. + *) + +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + MaxBatchIndex, \* bound on total batch creations + MaxSafeBlock, \* bound on L1 safe block + MAX_WAIT_BLOCKS \* staleness threshold + +NONE == -1 \* sentinel: "no w_nonce assigned" + +--------------------------------------------------------------------------- +(* Colors *) + +Gold == "Gold" +Silver == "Silver" +Bronze == "Bronze" +Pending == "Pending" +Tip == "Tip" + +Colors == {Gold, Silver, Bronze, Pending, Tip} + +ColorOrd(c) == + CASE c = Gold -> 0 + [] c = Silver -> 1 + [] c = Bronze -> 2 + [] c = Pending -> 3 + [] c = Tip -> 4 + +--------------------------------------------------------------------------- +(* Variables *) + +VARIABLES + spine, \* Seq of [index, color, safe_block, inclusion_block, + \* w_nonce, batch_nonce] + invalid, \* Seq of Nat: dead-branch count per spine position + nextIndex, \* Nat: next batch index + currentSafeBlock, \* Nat: L1 safe block (environment) + walletNonce, \* Nat: next w_nonce for mempool submission + zombies, \* Set of [batch_nonce, w_nonce, safe_block] + nextL1Slot, \* Nat: L1 nonce cursor (next w_nonce to include) + l1Included, \* Set of [batch_nonce, w_nonce, inclusion_block, + \* safe_block, is_safe] + schedulerCursor, \* Nat: next w_nonce the scheduler will process + schedulerExpected \* Nat: scheduler's expected batch nonce + +vars == <> + +--------------------------------------------------------------------------- +(* Helpers *) + +CountGold(s) == Cardinality({i \in 1..Len(s) : s[i].color = Gold}) + +FirstNonGold(s) == + IF \E i \in 1..Len(s) : s[i].color # Gold + THEN CHOOSE i \in 1..Len(s) : + s[i].color # Gold /\ \A j \in 1..i-1 : s[j].color = Gold + ELSE 0 + +\* First Pending without a w_nonce. +FirstUnsubmitted(s) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = NONE + THEN CHOOSE i \in 1..Len(s) : + s[i].color = Pending /\ s[i].w_nonce = NONE + /\ \A j \in 1..i-1 : ~(s[j].color = Pending /\ s[j].w_nonce = NONE) + ELSE 0 + +\* Spine position of Pending batch with a given w_nonce. +PendingAtWNonce(s, wn) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + ELSE 0 + +\* Spine position of Silver batch with a given batch_nonce. +SilverAtBN(s, bn) == + IF \E i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + ELSE 0 + +--------------------------------------------------------------------------- +(* Staleness *) + +IsStaleByInclusion(b) == b.inclusion_block - b.safe_block >= MAX_WAIT_BLOCKS +IsStaleByCurrentBlock(b) == currentSafeBlock - b.safe_block >= MAX_WAIT_BLOCKS + +--------------------------------------------------------------------------- +(* Invariants *) + +TypeOK == + /\ Len(spine) >= 1 + /\ nextIndex \in Nat + /\ currentSafeBlock \in Nat + /\ walletNonce \in Nat + /\ nextL1Slot \in Nat + /\ schedulerCursor \in Nat + /\ schedulerExpected \in Nat + +\* Gold* Silver* Bronze* Pending* Tip +SpineOrdering == + /\ spine[Len(spine)].color = Tip + /\ \A i \in 1..Len(spine)-1 : + ColorOrd(spine[i].color) <= ColorOrd(spine[i+1].color) + +SafeBlockMonotonic == + \A i \in 1..Len(spine)-1 : + (spine[i].color # Tip /\ spine[i+1].color # Tip) + => spine[i].safe_block <= spine[i+1].safe_block + +InvalidOnlyOnGold == + \A i \in 1..Len(spine) : invalid[i] > 0 => spine[i].color = Gold + +CurrentStalenessMonotonic == + \A i, j \in 1..Len(spine) : + (i < j /\ spine[i].color # Tip /\ spine[j].color # Tip + /\ IsStaleByCurrentBlock(spine[j])) + => IsStaleByCurrentBlock(spine[i]) + +BatchNoncesContiguous == + \A i \in 1..Len(spine) : + spine[i].color # Tip => spine[i].batch_nonce = i - 1 + +\* ------- THE KEY THEOREM ------- +ZombieSafety == schedulerExpected = CountGold(spine) + +\* Supporting L1 invariants +L1WNonceUnique == + \A e1, e2 \in l1Included : e1.w_nonce = e2.w_nonce => e1 = e2 + +ZombieNotYetIncluded == + \A z \in zombies : z.w_nonce >= nextL1Slot + +L1BeforeCursor == + \A e \in l1Included : e.w_nonce < nextL1Slot + +SchedulerBehindL1 == + schedulerCursor <= nextL1Slot + +Inv == + /\ TypeOK + /\ SpineOrdering + /\ SafeBlockMonotonic + /\ InvalidOnlyOnGold + /\ CurrentStalenessMonotonic + /\ BatchNoncesContiguous + /\ ZombieSafety + /\ L1WNonceUnique + /\ ZombieNotYetIncluded + /\ L1BeforeCursor + /\ SchedulerBehindL1 + +--------------------------------------------------------------------------- +(* Initial state *) + +Init == + /\ spine = <<[index |-> 0, color |-> Tip, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> NONE, batch_nonce |-> 0]>> + /\ invalid = <<0>> + /\ nextIndex = 1 + /\ currentSafeBlock = 0 + /\ walletNonce = 0 + /\ zombies = {} + /\ nextL1Slot = 0 + /\ l1Included = {} + /\ schedulerCursor = 0 + /\ schedulerExpected = 0 + +--------------------------------------------------------------------------- +(* + * AdvanceTip: close the current Tip -> Pending, append new Tip. + * Assigns safe_block (from environment) and batch_nonce. + *) +AdvanceTip == + /\ nextIndex <= MaxBatchIndex + /\ LET tipPos == Len(spine) + IN + /\ spine[tipPos].color = Tip + /\ \E sb \in 0..currentSafeBlock : + /\ (tipPos > 1 => sb >= spine[tipPos - 1].safe_block) + /\ spine' = [i \in 1..Len(spine) + 1 |-> + IF i < tipPos THEN spine[i] + ELSE IF i = tipPos + THEN [index |-> spine[tipPos].index, + color |-> Pending, + safe_block |-> sb, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> tipPos - 1] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> 0, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..Len(spine) + 1 |-> + IF i <= Len(spine) THEN invalid[i] ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SubmitBatch: assign w_nonces to ALL unsubmitted Pending batches + * at once, in spine-position order. This models the real batch + * submitter which reads the on-chain nonce and submits every + * pending batch each tick. + *) +SubmitBatch == + LET unsubPos == {i \in 1..Len(spine) : + spine[i].color = Pending /\ spine[i].w_nonce = NONE} + \* Read on-chain nonce: can't use a slot L1 already consumed + wn0 == IF walletNonce >= nextL1Slot THEN walletNonce ELSE nextL1Slot + IN + /\ unsubPos # {} + /\ spine' = [i \in 1..Len(spine) |-> + IF i \in unsubPos + THEN [spine[i] EXCEPT + !.w_nonce = wn0 + Cardinality({j \in unsubPos : j < i})] + ELSE spine[i]] + /\ walletNonce' = wn0 + Cardinality(unsubPos) + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * L1Include: include one transaction at w_nonce = nextL1Slot. + * + * If both a spine Pending and a zombie exist at this slot, L1 + * non-deterministically picks one (mempool competition). + * + * Spine wins: Pending -> Bronze (or Silver if block already safe). + * Zombie wins: zombie included; competing Pending's w_nonce bumped. + * + * inclusion_block >= currentSafeBlock (L1 monotonicity: transactions + * are included in current or future blocks) and >= all previous + * inclusion blocks (block numbers are monotonic). + *) + +L1IncludeSpine == + LET pos == PendingAtWNonce(spine, nextL1Slot) + IN + /\ pos > 0 + /\ \E ib \in currentSafeBlock..MaxSafeBlock : + \* Block ordering: non-decreasing inclusion_block + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ LET isSafe == ib <= currentSafeBlock + newColor == IF isSafe THEN Silver ELSE Bronze + IN + /\ spine' = [spine EXCEPT ![pos].color = newColor, + ![pos].inclusion_block = ib] + /\ l1Included' = l1Included \union + {[batch_nonce |-> spine[pos].batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> spine[pos].safe_block, + is_safe |-> isSafe]} + /\ nextL1Slot' = nextL1Slot + 1 + \* Kill zombie at this slot if it existed + /\ zombies' = {z \in zombies : z.w_nonce # nextL1Slot} + /\ UNCHANGED <> + +L1IncludeZombie == + /\ \E z \in zombies : z.w_nonce = nextL1Slot + /\ LET z == CHOOSE zz \in zombies : zz.w_nonce = nextL1Slot + IN + \E ib \in currentSafeBlock..MaxSafeBlock : + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ l1Included' = l1Included \union + {[batch_nonce |-> z.batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> z.safe_block, + is_safe |-> (ib <= currentSafeBlock)]} + /\ nextL1Slot' = nextL1Slot + 1 + /\ zombies' = {zz \in zombies : zz.w_nonce # nextL1Slot} + \* If a spine Pending was competing at this slot, reset ALL + \* submitted Pending w_nonces. The batch submitter will + \* re-read the on-chain nonce and resubmit everything. + /\ LET hasConflict == PendingAtWNonce(spine, nextL1Slot) > 0 + IN + IF hasConflict + THEN /\ spine' = [i \in 1..Len(spine) |-> + IF spine[i].color = Pending + /\ spine[i].w_nonce # NONE + THEN [spine[i] EXCEPT !.w_nonce = NONE] + ELSE spine[i]] + /\ walletNonce' = nextL1Slot + 1 + ELSE /\ UNCHANGED spine + /\ UNCHANGED walletNonce + /\ UNCHANGED <> + +L1Include == L1IncludeSpine \/ L1IncludeZombie + +--------------------------------------------------------------------------- +(* + * AdvanceSafeBlock: environment advances the L1 safe block. + * Bronze -> Silver on spine when inclusion_block becomes safe. + * Marks l1Included entries as safe. + *) +AdvanceSafeBlock == + /\ currentSafeBlock < MaxSafeBlock + /\ \E sb \in (currentSafeBlock + 1)..MaxSafeBlock : + /\ currentSafeBlock' = sb + /\ spine' = [i \in 1..Len(spine) |-> + IF spine[i].color = Bronze /\ spine[i].inclusion_block <= sb + THEN [spine[i] EXCEPT !.color = Silver] + ELSE spine[i]] + /\ l1Included' = {[e EXCEPT !.is_safe = + (e.is_safe \/ (e.inclusion_block <= sb))] + : e \in l1Included} + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SchedulerStep: process the L1 entry at schedulerCursor. + * + * The on-chain scheduler sees L1 inputs in w_nonce order and + * maintains an expected batch nonce counter. + * + * Accept: batch_nonce matches AND not stale by inclusion. + * -> increment schedulerExpected, promote spine Silver -> Gold. + * Skip: nonce mismatch OR stale (nonce poisoning). + * -> schedulerExpected unchanged. + * + * If accepted but the batch is not on the spine (zombie was accepted), + * spine is unchanged but schedulerExpected increments. ZombieSafety + * would then be violated — which is exactly what we're proving + * cannot happen. + *) +SchedulerStep == + /\ \E e \in l1Included : e.w_nonce = schedulerCursor /\ e.is_safe + /\ LET entry == CHOOSE e \in l1Included : + e.w_nonce = schedulerCursor /\ e.is_safe + IN + LET stale == entry.inclusion_block - entry.safe_block + >= MAX_WAIT_BLOCKS + accepted == entry.batch_nonce = schedulerExpected /\ ~stale + IN + /\ schedulerCursor' = schedulerCursor + 1 + /\ IF accepted + THEN /\ schedulerExpected' = schedulerExpected + 1 + /\ LET gp == SilverAtBN(spine, schedulerExpected) + IN IF gp > 0 + THEN spine' = [spine EXCEPT ![gp].color = Gold] + ELSE UNCHANGED spine + ELSE /\ UNCHANGED schedulerExpected + /\ UNCHANGED spine + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * Resolve: detect staleness at the frontier, cascade-invalidate, + * create zombies from submitted Pending batches, open recovery Tip. + * + * CRITICAL: the frontier must be Silver (safe on L1) before we + * cascade. This guarantees the stale batch is permanently on L1 + * and the scheduler WILL see it and be poisoned — no mempool + * mutual exclusion can kill it. Detecting staleness on Bronze + * or Pending would allow a race where the recovery batch takes + * the frontier's L1 slot, preventing nonce poisoning and letting + * non-frontier zombies be accepted (see counterexample in commit + * history). + * + * Only submitted Pending batches (w_nonce # NONE) become zombies. + * Bronze/Silver batches are already in l1Included; the scheduler + * will process and reject them (stale or nonce mismatch). + * + * walletNonce is reset to nextL1Slot: the sequencer reads the + * latest on-chain nonce and resubmits from there. + *) +Resolve == + /\ nextIndex <= MaxBatchIndex + /\ LET fng == FirstNonGold(spine) + IN + /\ fng > 0 + /\ fng > 1 \* need a Gold parent + /\ spine[fng].color = Silver \* ONLY Silver — must be safe on L1 + /\ IsStaleByInclusion(spine[fng]) + /\ LET newLen == fng \* (fng-1) Golds + 1 new Tip + \* Zombies from submitted Pending batches in the cascade + newZombies == + {[batch_nonce |-> spine[i].batch_nonce, + w_nonce |-> spine[i].w_nonce, + safe_block |-> spine[i].safe_block] : + i \in {j \in fng..Len(spine) : + spine[j].color = Pending /\ spine[j].w_nonce # NONE}} + IN + /\ spine' = [i \in 1..newLen |-> + IF i < fng THEN spine[i] \* all Gold + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> 0, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..newLen |-> + IF i = fng - 1 + THEN invalid[fng - 1] + (Len(spine) - fng + 1) + ELSE IF i < fng THEN invalid[i] + ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ zombies' = zombies \union newZombies + /\ walletNonce' = nextL1Slot + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* Spec *) + +Next == + \/ AdvanceTip + \/ SubmitBatch + \/ L1Include + \/ AdvanceSafeBlock + \/ SchedulerStep + \/ Resolve + +Spec == Init /\ [][Next]_vars + +========================================================================= diff --git a/docs/recovery/justfile b/docs/recovery/justfile new file mode 100644 index 0000000..a35604a --- /dev/null +++ b/docs/recovery/justfile @@ -0,0 +1,12 @@ +tlc := env("TLC", "tlc") + +# Check the preemptive recovery spec (~90s) +check-preemptive: + {{tlc}} -workers auto -deadlock preemptive.tla + +# Check the optimistic recovery spec (~3min) +check-optimistic: + {{tlc}} -workers auto -deadlock history/optimistic.tla + +# Check all specs +check-all: check-preemptive check-optimistic diff --git a/docs/recovery/preemptive.cfg b/docs/recovery/preemptive.cfg new file mode 100644 index 0000000..a5ce60d --- /dev/null +++ b/docs/recovery/preemptive.cfg @@ -0,0 +1,10 @@ +SPECIFICATION Spec + +CONSTANTS + MaxBatchIndex = 5 + MaxSafeBlock = 5 + MAX_WAIT_BLOCKS = 2 + MaxWalletNonce = 8 + +INVARIANTS + Inv diff --git a/docs/recovery/preemptive.tla b/docs/recovery/preemptive.tla new file mode 100644 index 0000000..1991540 --- /dev/null +++ b/docs/recovery/preemptive.tla @@ -0,0 +1,445 @@ +---------------------------- MODULE preemptive ----------------------------- +(* + * Full operational model of the preemptive recovery design. + * + * Extends V3 with flush modeling: at each w_nonce slot, L1 + * non-deterministically includes the spine batch OR a flush no-op + * (killing the batch). This captures the complete flush lifecycle + * including the case where the frontier batch itself is killed. + * + * A killed batch acts as silent poison: the scheduler never sees it, + * so schedulerExpected stays stuck at its batch_nonce. All subsequent + * batches — whether alive on L1 or dead — have wrong nonces. + * Recovery resubmits the killed batch; if stale by inclusion, Resolve + * cascades; if fresh, the scheduler accepts it. Resolve can also + * discard an aging open Tip whose current-safe-block age has reached + * MAX_WAIT_BLOCKS. + * + * Colors on the spine: Gold* Silver* Bronze* Pending* Tip + * During flush, SpineOrdering can be temporarily violated (a killed + * Pending appears before a surviving Silver). This is transient — + * recovery restores Gold* + Tip. SpineOrdering is NOT checked as + * an invariant. + * + * Proves: ZombieSafety == schedulerExpected = CountGold(spine) + * + * Actions: + * AdvanceTip -- close tip -> Pending, append new Tip + * SubmitBatch -- assign w_nonces to unsubmitted Pendings + * L1IncludeSpine -- spine batch wins its slot -> Bronze/Silver + * L1SkipSpine -- flush no-op wins, spine batch killed + * L1IncludeDead -- dead batch beats its flush no-op + * L1SkipDead -- flush no-op wins, dead batch killed + * AdvanceSafeBlock -- L1 safe block advances, Bronze -> Silver + * SchedulerStep -- scheduler processes next safe entry -> Gold + * SchedulerSkip -- scheduler skips gap (no-op slot) + * Resolve -- stale unresolved frontier -> cascade, recover + *) + +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + MaxBatchIndex, + MaxSafeBlock, + MAX_WAIT_BLOCKS, + MaxWalletNonce \* bound on wallet nonce to keep state space finite + +NONE == -1 + +--------------------------------------------------------------------------- +(* Colors *) + +Gold == "Gold" +Silver == "Silver" +Bronze == "Bronze" +Pending == "Pending" +Tip == "Tip" + +Colors == {Gold, Silver, Bronze, Pending, Tip} + +ColorOrd(c) == + CASE c = Gold -> 0 + [] c = Silver -> 1 + [] c = Bronze -> 2 + [] c = Pending -> 3 + [] c = Tip -> 4 + +--------------------------------------------------------------------------- +(* Variables *) + +VARIABLES + spine, + invalid, + nextIndex, + currentSafeBlock, + walletNonce, + nextL1Slot, + l1Included, + schedulerCursor, + schedulerExpected, + deadBatches + +vars == <> + +--------------------------------------------------------------------------- +(* Helpers *) + +CountGold(s) == Cardinality({i \in 1..Len(s) : s[i].color = Gold}) + +FirstNonGold(s) == + IF \E i \in 1..Len(s) : s[i].color # Gold + THEN CHOOSE i \in 1..Len(s) : + s[i].color # Gold /\ \A j \in 1..i-1 : s[j].color = Gold + ELSE 0 + +PendingAtWNonce(s, wn) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + ELSE 0 + +SilverAtBN(s, bn) == + IF \E i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + ELSE 0 + +--------------------------------------------------------------------------- +(* Staleness *) + +IsStaleByInclusion(b) == b.inclusion_block - b.safe_block >= MAX_WAIT_BLOCKS + +IsStaleByCurrent(b) == currentSafeBlock - b.safe_block >= MAX_WAIT_BLOCKS + +--------------------------------------------------------------------------- +(* Invariants *) + +TypeOK == + /\ Len(spine) >= 1 + /\ nextIndex \in Nat + /\ currentSafeBlock \in Nat + /\ walletNonce \in Nat + /\ nextL1Slot \in Nat + /\ schedulerCursor \in Nat + /\ schedulerExpected \in Nat + +\* Batch nonces are contiguous (0..N-1) for non-Tip spine elements. +BatchNoncesContiguous == + \A i \in 1..Len(spine) : + spine[i].color # Tip => spine[i].batch_nonce = i - 1 + +\* Dead branches only hang off Gold nodes. +InvalidOnlyOnGold == + \A i \in 1..Len(spine) : invalid[i] > 0 => spine[i].color = Gold + +\* ------- THE KEY THEOREM ------- +\* The scheduler accepts exactly the Gold prefix. +ZombieSafety == schedulerExpected = CountGold(spine) + +\* L1 consistency +L1WNonceUnique == + \A e1, e2 \in l1Included : e1.w_nonce = e2.w_nonce => e1 = e2 + +L1BeforeCursor == + \A e \in l1Included : e.w_nonce < nextL1Slot + +SchedulerBehindL1 == + schedulerCursor <= nextL1Slot + +DeadNotYetIncluded == + \A d \in deadBatches : d.w_nonce >= nextL1Slot + +Inv == + /\ TypeOK + /\ BatchNoncesContiguous + /\ InvalidOnlyOnGold + /\ ZombieSafety + /\ L1WNonceUnique + /\ L1BeforeCursor + /\ SchedulerBehindL1 + /\ DeadNotYetIncluded + +--------------------------------------------------------------------------- +(* Initial state *) + +(* + * Initial state: Genesis sentinel (nonce 0) is already Gold. + * This is a modeling technique that eliminates the nonce-0 edge + * case, allowing Resolve to use uniform logic. The implementation + * can handle nonce-0 however is simplest (see README.md). + * + * Tip.safe_block models the first frame's safe_block of the open batch. + * Keeping it meaningful lets the spec represent a Tip that ages past + * MAX_WAIT_BLOCKS before ever getting an L1 transaction. + *) +Init == + /\ spine = <<[index |-> 0, color |-> Gold, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> 0, batch_nonce |-> 0], + [index |-> 1, color |-> Tip, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> NONE, batch_nonce |-> 0]>> + /\ invalid = <<0, 0>> + /\ nextIndex = 2 + /\ currentSafeBlock = 0 + /\ walletNonce = 1 + /\ nextL1Slot = 1 + /\ l1Included = {[batch_nonce |-> 0, w_nonce |-> 0, + inclusion_block |-> 0, safe_block |-> 0, + is_safe |-> TRUE]} + /\ schedulerCursor = 1 + /\ schedulerExpected = 1 + /\ deadBatches = {} + +--------------------------------------------------------------------------- +(* AdvanceTip: close tip -> Pending, append new Tip *) + +AdvanceTip == + /\ nextIndex <= MaxBatchIndex + /\ LET tipPos == Len(spine) IN + /\ spine[tipPos].color = Tip + /\ spine[tipPos].safe_block <= currentSafeBlock + /\ (tipPos > 1 => spine[tipPos].safe_block >= spine[tipPos - 1].safe_block) + /\ spine' = [i \in 1..Len(spine) + 1 |-> + IF i < tipPos THEN spine[i] + ELSE IF i = tipPos + THEN [index |-> spine[tipPos].index, + color |-> Pending, + safe_block |-> spine[tipPos].safe_block, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> tipPos - 1] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> currentSafeBlock, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..Len(spine) + 1 |-> + IF i <= Len(spine) THEN invalid[i] ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SubmitBatch: assign w_nonces to ALL unsubmitted Pending batches + * at once, in spine-position order. + *) +SubmitBatch == + LET unsubPos == {i \in 1..Len(spine) : + spine[i].color = Pending /\ spine[i].w_nonce = NONE} + wn0 == IF walletNonce >= nextL1Slot THEN walletNonce ELSE nextL1Slot + IN + /\ unsubPos # {} + /\ wn0 + Cardinality(unsubPos) <= MaxWalletNonce \* bound check + /\ spine' = [i \in 1..Len(spine) |-> + IF i \in unsubPos + THEN [spine[i] EXCEPT + !.w_nonce = wn0 + Cardinality({j \in unsubPos : j < i})] + ELSE spine[i]] + /\ walletNonce' = wn0 + Cardinality(unsubPos) + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * L1 actions: the L1 stream processes transactions in w_nonce order. + * At each slot, if both a spine batch and a flush no-op exist, + * L1 non-deterministically picks one. + * + * inclusion_block >= currentSafeBlock (L1 monotonicity) and + * >= all previous inclusion_blocks (block ordering). + *) + +\* Spine batch wins its slot -> Bronze or Silver. +L1IncludeSpine == + LET pos == PendingAtWNonce(spine, nextL1Slot) IN + /\ pos > 0 + /\ \E ib \in currentSafeBlock..MaxSafeBlock : + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ LET isSafe == ib <= currentSafeBlock + newColor == IF isSafe THEN Silver ELSE Bronze + IN + /\ spine' = [spine EXCEPT ![pos].color = newColor, + ![pos].inclusion_block = ib] + /\ l1Included' = l1Included \union + {[batch_nonce |-> spine[pos].batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> spine[pos].safe_block, + is_safe |-> isSafe]} + /\ nextL1Slot' = nextL1Slot + 1 + /\ UNCHANGED <> + +\* Flush no-op wins at a spine Pending's slot. +\* The batch is killed: w_nonce reset to NONE. +\* The scheduler never sees it — silent nonce poison. +L1SkipSpine == + LET pos == PendingAtWNonce(spine, nextL1Slot) IN + /\ pos > 0 + /\ spine' = [spine EXCEPT ![pos].w_nonce = NONE] + /\ nextL1Slot' = nextL1Slot + 1 + /\ UNCHANGED <> + +\* Dead batch (from cascade) beats its flush no-op. +L1IncludeDead == + /\ \E d \in deadBatches : d.w_nonce = nextL1Slot + /\ LET d == CHOOSE dd \in deadBatches : dd.w_nonce = nextL1Slot IN + \E ib \in currentSafeBlock..MaxSafeBlock : + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ l1Included' = l1Included \union + {[batch_nonce |-> d.batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> d.safe_block, + is_safe |-> (ib <= currentSafeBlock)]} + /\ deadBatches' = deadBatches \ {d} + /\ nextL1Slot' = nextL1Slot + 1 + /\ UNCHANGED <> + +\* Flush no-op wins at a dead batch's slot. +L1SkipDead == + /\ \E d \in deadBatches : d.w_nonce = nextL1Slot + /\ LET d == CHOOSE dd \in deadBatches : dd.w_nonce = nextL1Slot IN + /\ deadBatches' = deadBatches \ {d} + /\ nextL1Slot' = nextL1Slot + 1 + /\ UNCHANGED <> + +L1Include == + \/ L1IncludeSpine + \/ L1SkipSpine + \/ L1IncludeDead + \/ L1SkipDead + +--------------------------------------------------------------------------- +(* AdvanceSafeBlock: L1 safe block advances, Bronze -> Silver *) + +AdvanceSafeBlock == + /\ currentSafeBlock < MaxSafeBlock + /\ \E sb \in (currentSafeBlock + 1)..MaxSafeBlock : + /\ currentSafeBlock' = sb + /\ spine' = [i \in 1..Len(spine) |-> + IF spine[i].color = Bronze /\ spine[i].inclusion_block <= sb + THEN [spine[i] EXCEPT !.color = Silver] + ELSE spine[i]] + /\ l1Included' = {[e EXCEPT !.is_safe = + (e.is_safe \/ (e.inclusion_block <= sb))] + : e \in l1Included} + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SchedulerStep: process the L1 entry at schedulerCursor. + * Accept: batch_nonce matches AND not stale -> Gold promotion. + * Skip: nonce mismatch OR stale (nonce poisoning). + *) +SchedulerStep == + /\ \E e \in l1Included : e.w_nonce = schedulerCursor /\ e.is_safe + /\ LET entry == CHOOSE e \in l1Included : + e.w_nonce = schedulerCursor /\ e.is_safe + IN + LET stale == entry.inclusion_block - entry.safe_block + >= MAX_WAIT_BLOCKS + accepted == entry.batch_nonce = schedulerExpected /\ ~stale + IN + /\ schedulerCursor' = schedulerCursor + 1 + /\ IF accepted + THEN /\ schedulerExpected' = schedulerExpected + 1 + /\ LET gp == SilverAtBN(spine, schedulerExpected) + IN IF gp > 0 + THEN spine' = [spine EXCEPT ![gp].color = Gold] + ELSE UNCHANGED spine + ELSE /\ UNCHANGED schedulerExpected + /\ UNCHANGED spine + /\ UNCHANGED <> + +(* + * SchedulerSkip: advance cursor over a gap (no-op consumed the slot, + * so no l1Included entry exists). + *) +SchedulerSkip == + /\ schedulerCursor < nextL1Slot + /\ ~(\E e \in l1Included : e.w_nonce = schedulerCursor) + /\ schedulerCursor' = schedulerCursor + 1 + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * Resolve: the oldest unresolved batch is definitely stale -> + * cascade-invalidate. + * + * Two cases are modeled: + * 1. the frontier unresolved batch is Silver and stale by inclusion + * (the submitted-batch zombie path), or + * 2. the frontier unresolved batch is Tip and stale by currentSafeBlock + * (the aging open-batch path). + * + * Cascade-invalidated batches already on L1 (Silver/Bronze) remain + * in l1Included. Submitted Pendings become dead batches. + * Unsubmitted Pendings are discarded. + * + * walletNonce is NOT reset — recovery batches use w_nonces past + * all dead batch slots. + * + * The genesis sentinel guarantees fng > 1 (there is always at + * least one Gold ancestor). + *) +Resolve == + /\ nextIndex <= MaxBatchIndex + /\ LET fng == FirstNonGold(spine) IN + /\ fng > 1 + /\ ((spine[fng].color = Silver /\ IsStaleByInclusion(spine[fng])) + \/ (spine[fng].color = Tip /\ IsStaleByCurrent(spine[fng]))) + /\ LET newLen == fng + newDead == + {[batch_nonce |-> spine[i].batch_nonce, + w_nonce |-> spine[i].w_nonce, + safe_block |-> spine[i].safe_block] : + i \in {j \in fng..Len(spine) : + spine[j].color = Pending /\ spine[j].w_nonce # NONE}} + IN + /\ spine' = [i \in 1..newLen |-> + IF i < fng THEN spine[i] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> currentSafeBlock, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..newLen |-> + IF i = fng - 1 + THEN invalid[fng - 1] + (Len(spine) - fng + 1) + ELSE IF i < fng THEN invalid[i] + ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ deadBatches' = deadBatches \union newDead + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* Spec *) + +Next == + \/ AdvanceTip + \/ SubmitBatch + \/ L1Include + \/ AdvanceSafeBlock + \/ SchedulerStep + \/ SchedulerSkip + \/ Resolve + +Spec == Init /\ [][Next]_vars + +========================================================================= diff --git a/docs/threat-model/README.md b/docs/threat-model/README.md new file mode 100644 index 0000000..733c040 --- /dev/null +++ b/docs/threat-model/README.md @@ -0,0 +1,101 @@ +# Threat Model + +The security posture this codebase defends against. Defines what is in scope for security review, what is out of scope, and the trust level assigned to each actor and interface. + +See [`../recovery/README.md`](../recovery/README.md) for the recovery subsystem, which operationalizes parts of this threat model (adversarial mempool, fail-stop L1 provider). + +## Assets + +What we are protecting: + +- **Rollup state integrity.** The canonical on-chain state must reflect a deterministic replay of user operations and direct inputs. Any divergence between the sequencer's off-chain view and the scheduler's on-chain execution is a state-integrity failure. +- **Soft-confirmation honesty.** Every soft confirmation issued by the sequencer must land on L1 as promised, or be explicitly revoked via recovery. +- **User funds.** No user operation, replay, or protocol break can cause users to lose funds. +- **Batch-submitter key.** Held in operator infra; not hijackable by network attackers. + +## Actors and trust + +| Actor | Trust | Capabilities | +|-------|-------|--------------| +| InputBox contract | Trusted | Authenticates `msg_sender` on `addInput`. Use correctly; do not model forgery. | +| Our Ethereum node | Trusted, fail-stop | Inside our infra. May become unreachable; will never lie. | +| Fallback RPC (Infura / Alchemy) | Semi-trusted, fail-stop | Liveness fallback during primary outages. May withhold or delay. Never byzantine. | +| Operator env / CLI flags | Trusted | Configuration is authoritative. | +| Batch-submitter private key | Private | Held in operator infra. Not reachable by the network. | +| Sequencer's own code | Trusted (bug-free is a precondition) | Bugs are caught via tests and review, not defended against at runtime. See "self-trust" below. | +| **L1 mempool and block builders** | **Fully adversarial** | May reorder, delay, drop, or selectively include submitted transactions. Private mempools mean "dropped" is indistinguishable from "delayed indefinitely." | +| HTTP clients at `POST /tx` | Untrusted | Arbitrary public callers. May submit malformed, malicious, or replay payloads. | +| WebSocket subscribers at `/ws/subscribe` | Internal, but untrusted for data-exposure | Intended for internal indexers. Treat as public for what is exposed. | +| Direct-input senders on L1 | Untrusted | Arbitrary L1 accounts calling InputBox. May submit any calldata. | + +### Self-trust + +The sequencer trusts that its own code is correct. If the sequencer emits a malformed batch, frame, or user op, it is already in a bug state that requires manual intervention — we do not layer runtime defenses against sequencer self-misbehavior. Recovery addresses liveness failures (infrastructure outages, network partitions, gateway failure), not bug-induced malformed state. + +This is not an excuse to skip validation at trust boundaries. Inputs from untrusted actors are validated rigorously. Internal invariants are enforced by type system, SQL constraints, and tests — not by defensive runtime checks against hypothetical self-misbehavior. + +## In-scope failure modes + +- L1 provider outages (primary and fallback), minutes to hours +- Process crashes at arbitrary points, including mid-transaction +- **Adversarial mempool:** reorder, delay, drop, selective inclusion by builders +- **Zombie transactions:** a submitted batch may sit in a private mempool indefinitely and land long after we believed it was gone. The recovery flusher is load-bearing for this threat: it consumes every pending `w_nonce` slot with a no-op so zombies cannot claim them. +- L1 reorgs up to safe depth +- Malicious `POST /tx` callers: malformed signatures, spoofed sender, replay across chains or apps, nonce manipulation +- Malicious direct-input senders: arbitrary payload, any intent; sender authenticity is guaranteed by InputBox +- Scheduler/sequencer protocol divergence of any kind (ordering, nonce rules, signature validity, fee semantics) + +## Out of scope + +- **DoS, rate limiting, resource exhaustion.** Handled by infrastructure (WAF, load balancer, connection limits). Not addressed at the Rust layer. +- **Byzantine L1 provider.** Our own node; honest by assumption. +- **Byzantine InputBox.** Audited L1 contract; trusted. +- **Memory safety.** Rust eliminates this class. +- **Secrets-at-rest security.** Handled by operator infra (secrets manager, file permissions, encrypted volumes). +- **Supply-chain compromise of dependencies.** Tracked via dependency pinning and out-of-band vulnerability feeds, not by code review. +- **Sequencer self-bugs as an attack vector.** Addressed via correctness review, tests, and manual intervention when they occur — see "Self-trust" above. + +## External assumptions we rely on + +These are preconditions the sequencer takes as given. They are neither "trust" nor "threat" — they are invariants about the environment that must hold for the design to be sound. If they break, the sequencer's safety guarantees degrade. + +### L1 block-time coupling + +The wall-clock fallback in [`sequencer/src/recovery/mod.rs`](../../sequencer/src/recovery/mod.rs) estimates missed blocks as: + +``` +estimated_missed_blocks = (now - last_sync_ms) / SEQ_SECONDS_PER_BLOCK +``` + +This assumes a **known, bounded-variance relationship** between elapsed wall-clock time and mined L1 block count. The assumption has three parts: + +1. **Known average block time** — `SEQ_SECONDS_PER_BLOCK` (default 12s, Ethereum mainnet) accurately reflects the target chain's block cadence. +2. **Bounded variance** — over the danger-threshold window (~4h on mainnet), the delta between `elapsed_seconds / avg_block_time` and actual mined blocks is small. On Ethereum mainnet this holds: slot proposers occasionally skip, but >99% of slots produce a block. +3. **Wall clock is monotonic and accurate** — the host's `SystemTime::now()` does not jump backward significantly or drift. Handled by saturating subtraction against clock backward jumps, but not against systematic drift. + +**Where it matters.** Only on the fallback path — when L1 is unreachable and we cannot observe block numbers directly. When L1 is up, observed block numbers are authoritative and this assumption is not consulted. + +**Violation modes.** +- **Chain with unstable block time.** A chain where average block time drifts substantially (e.g., PoW networks under major hashrate swings) would make the estimate less reliable. Mitigation: `SEQ_SECONDS_PER_BLOCK` should be tuned conservatively (overestimate block time → underestimate missed blocks → more cautious recovery triggers). +- **Operator misconfigures `SEQ_SECONDS_PER_BLOCK`.** Typo or copy-paste error pointing at the wrong chain's cadence. Operator-trust scope. +- **Significant host clock drift.** A sequencer host whose clock lags or leads the real-world by minutes per day could slowly desynchronize its danger estimates from reality. + +**Corollary for test design.** To deterministically exercise the wall-clock fallback, tests must maintain this coupling: when advancing the L1 block count, they should also advance (or simulate) the corresponding wall-clock interval. Our e2e harness does the reverse — it rewinds `l1_safe_head.synced_at_ms` to an older timestamp, which is semantically equivalent to advancing the wall clock. + +## How to apply this doc in code review + +For each code path under review: + +1. **Where does the input come from?** Map the source to the actor table. Untrusted sources require validation; trusted sources do not. +2. **What are the downstream effects?** DB write, signed L1 submission, WS broadcast, process control. The more consequential the effect, the tighter the validation must be. +3. **Does the code assume any actor behaves better than the table says?** Common mistakes: + - Assuming the mempool won't hold a tx indefinitely. + - Assuming a tx we "gave up on" is permanently dead. + - Assuming `safe_block` is current during an RPC outage. + - Assuming the sequencer's own code is correct where a bug would breach a trust boundary (e.g., emit signed state to L1). +4. **Correctness or exploitation?** Both are in scope. Under rollup semantics, a correctness bug that causes state divergence is as severe as a direct exploit. + +## Related documents + +- [`../recovery/README.md`](../recovery/README.md) — recovery design, TLA+ formal verification +- [`../../AGENTS.md`](../../AGENTS.md) — architecture, coding conventions, hot-path invariants diff --git a/examples/app-core/src/application/wallet.rs b/examples/app-core/src/application/wallet.rs index e1db37a..d4f5f55 100644 --- a/examples/app-core/src/application/wallet.rs +++ b/examples/app-core/src/application/wallet.rs @@ -145,14 +145,8 @@ impl Application for WalletApp { }); } - let max_fee = user_op.max_fee; - // Users sign a cap (log-space exponent); sequencer executes against the committed frame fee. - if max_fee < current_fee { - return Err(InvalidReason::InvalidMaxFee { - max_fee, - base_fee: current_fee, - }); - } + // max_fee < current_fee is already checked by the trait default in + // validate_and_execute_user_op. No need to repeat here. let gas_cost = sequencer_core::fee::fee_to_linear(current_fee); let balance = self.balance_of(&sender); @@ -183,33 +177,31 @@ impl Application for WalletApp { let method = Method::from_ssz_bytes(user_op.data.as_slice()).ok(); match method.as_ref() { - Some(Method::Transfer(transfer)) => { - if self.debit_if_possible(sender, transfer.amount) { - self.credit(transfer.to, transfer.amount); - outputs.push(AppOutput::Notice( - TransferNotice { - sender, - recipient: transfer.to, - amount: transfer.amount, - } - .abi_encode(), - )); - } + Some(Method::Transfer(transfer)) if self.debit_if_possible(sender, transfer.amount) => { + self.credit(transfer.to, transfer.amount); + outputs.push(AppOutput::Notice( + TransferNotice { + sender, + recipient: transfer.to, + amount: transfer.amount, + } + .abi_encode(), + )); } - Some(Method::Withdrawal(withdrawal)) => { - if self.debit_if_possible(sender, withdrawal.amount) { - outputs.push(AppOutput::Voucher { - destination: self.config.supported_erc20_token, - value: U256::ZERO, - payload: Erc20Transfer { - recipient: sender, - amount: withdrawal.amount, - } - .abi_encode(), - }); - } + Some(Method::Withdrawal(withdrawal)) + if self.debit_if_possible(sender, withdrawal.amount) => + { + outputs.push(AppOutput::Voucher { + destination: self.config.supported_erc20_token, + value: U256::ZERO, + payload: Erc20Transfer { + recipient: sender, + amount: withdrawal.amount, + } + .abi_encode(), + }); } - None => {} + _ => {} } self.executed_input_count = self.executed_input_count.saturating_add(1); @@ -279,6 +271,8 @@ mod tests { #[test] fn validate_rejects_when_max_fee_below_current_fee() { + use sequencer_core::application::{Application, ExecutionOutcome}; + let mut app = WalletApp::new(WalletConfig::default()); let sender = Address::from_slice(&[0x11; 20]); app.balances.insert(sender, U256::from(10_u64)); @@ -289,15 +283,17 @@ mod tests { data: Vec::::new().into(), }; - let err = app - .validate_user_op(sender, &user_op, 2) - .expect_err("max_fee < current_fee should be invalid"); + // The max_fee < current_fee check now lives in the trait default + // (validate_and_execute_user_op), not in validate_user_op directly. + let result = app + .validate_and_execute_user_op(sender, &user_op, 2) + .expect("should return Ok(Invalid), not Err"); assert_eq!( - err, - InvalidReason::InvalidMaxFee { + result, + ExecutionOutcome::Invalid(InvalidReason::InvalidMaxFee { max_fee: 1, base_fee: 2 - } + }) ); } diff --git a/examples/canonical-app/justfile b/examples/canonical-app/justfile index 10f08f0..f3eb915 100644 --- a/examples/canonical-app/justfile +++ b/examples/canonical-app/justfile @@ -2,7 +2,13 @@ set shell := ["bash", "-euo", "pipefail", "-c"] out_dir := "out" source_date_epoch := "0" +cartesi_machine_version := "0.20.0" +linux_image_release := "v0.20.0" +linux_kernel_filename := "linux-6.5.13-ctsi-1-v0.20.0.bin" linux_kernel := out_dir + "/linux.bin" +linux_kernel_sha512 := linux_kernel + ".sha512" +linux_kernel_url := "https://github.com/cartesi/machine-linux-image/releases/download/" + linux_image_release + "/" + linux_kernel_filename +linux_kernel_sha512_url := linux_kernel_url + ".sha512" rootfs_tar := out_dir + "/canonical-rootfs.tar" rootfs_ext2 := out_dir + "/canonical-rootfs.ext2" machine_image := out_dir + "/canonical-machine-image" @@ -13,7 +19,17 @@ machine_image_sepolia := out_dir + "/canonical-machine-image-sepolia" download-deps: @mkdir -p {{out_dir}} - @if [[ ! -f {{linux_kernel}} ]]; then wget https://github.com/cartesi/image-kernel/releases/download/v0.20.0/linux-6.5.13-ctsi-1-v0.20.0.bin -O {{linux_kernel}}; fi + @kernel_tmp="{{linux_kernel}}.tmp"; checksum_tmp="{{linux_kernel_sha512}}.tmp"; \ + verify_kernel() { (cd {{out_dir}} && shasum -a 512 -c "$(basename {{linux_kernel_sha512}})" >/dev/null); }; \ + if [[ ! -s {{linux_kernel}} || ! -s {{linux_kernel_sha512}} ]] || ! verify_kernel; then \ + rm -f "{{linux_kernel}}" "{{linux_kernel_sha512}}" "$kernel_tmp" "$checksum_tmp"; \ + wget "{{linux_kernel_url}}" -O "$kernel_tmp"; \ + wget "{{linux_kernel_sha512_url}}" -O "$checksum_tmp"; \ + mv "$kernel_tmp" "{{linux_kernel}}"; \ + sed "s# artifacts/[^ ]*\$# $(basename {{linux_kernel}})#" "$checksum_tmp" > "{{linux_kernel_sha512}}"; \ + rm -f "$checksum_tmp"; \ + verify_kernel; \ + fi build-dapp: build-dapp-devnet @@ -59,8 +75,9 @@ clean: rm -rf {{out_dir}} build-machine-image: clean-machine-image build-rootfs-devnet - test -f {{linux_kernel}} || { echo "missing {{linux_kernel}}; run 'just setup' first"; exit 1; } + test -s {{linux_kernel}} || { echo "missing or empty {{linux_kernel}}; run 'just setup' first"; exit 1; } cartesi-machine \ + --assert-version={{cartesi_machine_version}} \ --ram-length=128Mi \ --ram-image={{linux_kernel}} \ --flash-drive=label:root,data_filename:{{rootfs_ext2}} \ @@ -70,8 +87,9 @@ build-machine-image: clean-machine-image build-rootfs-devnet --store={{machine_image}} build-machine-image-sepolia: clean-machine-image-sepolia build-rootfs-sepolia - test -f {{linux_kernel}} || { echo "missing {{linux_kernel}}; run 'just setup' first"; exit 1; } + test -s {{linux_kernel}} || { echo "missing or empty {{linux_kernel}}; run 'just setup' first"; exit 1; } cartesi-machine \ + --assert-version={{cartesi_machine_version}} \ --ram-length=128Mi \ --ram-image={{linux_kernel}} \ --flash-drive=label:root,data_filename:{{rootfs_ext2}} \ diff --git a/examples/canonical-app/src/scheduler/core.rs b/examples/canonical-app/src/scheduler/core.rs index 95618e2..90b49bd 100644 --- a/examples/canonical-app/src/scheduler/core.rs +++ b/examples/canonical-app/src/scheduler/core.rs @@ -13,7 +13,7 @@ pub const DEVNET_SEQUENCER_ADDRESS: Address = address!("0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266"); pub const SEPOLIA_SEQUENCER_ADDRESS: Address = address!("0x16d5FF3Fdd14e2a86FBA77cbcE6B3Cd9C32b8Ff3"); -pub const MAX_WAIT_BLOCKS: u64 = 1200; +pub const MAX_WAIT_BLOCKS: u64 = sequencer_core::MAX_WAIT_BLOCKS; #[derive(Debug, Clone, PartialEq, Eq)] pub struct SchedulerConfig { @@ -187,7 +187,6 @@ impl Scheduler { self.config.max_wait_blocks, inclusion_block, ) { - self.advance_expected_batch_nonce(); return ProcessResult::without_outputs(ProcessOutcome::BatchSkippedStale); } @@ -245,6 +244,8 @@ impl Scheduler { for user_op in &frame.user_ops { if let Some(sender) = self.recover_sender(domain, user_op) { let plain = user_op.to_user_op(); + // Defense-in-depth: the trait default in validate_and_execute_user_op + // now centralizes this check, but we keep it here as an extra guard. if plain.max_fee < frame.fee_price { eprintln!("scheduler skipped frame user-op due to max_fee < fee_price"); continue; @@ -327,13 +328,7 @@ fn has_elapsed_since(start_block: u64, wait_blocks: u64, current_block: u64) -> } pub(super) fn input_domain(chain_id: u64, verifying_contract: Address) -> Eip712Domain { - Eip712Domain { - name: None, - version: None, - chain_id: Some(U256::from(chain_id)), - verifying_contract: Some(verifying_contract), - salt: None, - } + sequencer_core::build_input_domain(chain_id, verifying_contract) } pub(super) fn block_to_u64(block: U256) -> u64 { @@ -619,7 +614,7 @@ mod tests { } #[test] - fn stale_batch_is_skipped_and_consumes_nonce() { + fn stale_batch_is_skipped_without_consuming_nonce() { let mut scheduler = Scheduler::new( RecordingApp::default(), SchedulerConfig { @@ -648,14 +643,16 @@ mod tests { let outcome = scheduler.process_input(batch_input(10, stale_batch)); assert_eq!(outcome, ProcessOutcome::BatchSkippedStale); assert_eq!(scheduler.app.events(), [RecordedTx::Direct(9)]); - assert_eq!(scheduler.next_expected_batch_nonce(), 1); + // Stale batches do NOT consume the nonce — they are true no-ops in nonce space. + assert_eq!(scheduler.next_expected_batch_nonce(), 0); + // The next valid batch reuses nonce 0. let fresh_signing_key = SigningKey::from_bytes((&[13_u8; 32]).into()).expect("fresh signing key"); let fresh_sender = address_from_signing_key(&fresh_signing_key); scheduler.app.credit(fresh_sender, 1); let fresh_batch = Batch { - nonce: 1, + nonce: 0, frames: vec![Frame { user_ops: vec![sign_wire_user_op( &test_domain(), diff --git a/examples/canonical-test/src/main.rs b/examples/canonical-test/src/main.rs index 775df0a..32d57f6 100644 --- a/examples/canonical-test/src/main.rs +++ b/examples/canonical-test/src/main.rs @@ -49,20 +49,23 @@ pub fn scheduler_rejected_batch_does_not_consume_nonce() -> TestResult { } #[testsi::test_dapp(kind("scheduler"))] -pub fn scheduler_stale_batch_consumes_nonce_without_report() -> TestResult { +pub fn scheduler_stale_batch_is_skipped_without_consuming_nonce() -> TestResult { let mut machine = devnet_machine()?; let stale_trigger_block = SchedulerConfig::devnet().max_wait_blocks as usize + 1; + // Stale batch (nonce 0, safe_block 1, inclusion block > max_wait_blocks) → skipped silently. let (outputs, reports) = machine.advance_state(batch_input( stale_trigger_block, batch_with_safe_blocks(0, &[1]), ))?; assert_no_outputs_or_reports(&outputs, &reports); + // Fresh batch with nonce 0 succeeds — stale batch did NOT consume the nonce. let (outputs, reports) = machine.advance_state(batch_input(stale_trigger_block + 1, empty_batch(0)))?; - assert_invalid_batch_step(&outputs, &reports); + assert_no_outputs_or_reports(&outputs, &reports); + // Next batch with nonce 1 also succeeds. let (outputs, reports) = machine.advance_state(batch_input(stale_trigger_block + 2, empty_batch(1)))?; assert_no_outputs_or_reports(&outputs, &reports); @@ -228,13 +231,7 @@ fn devnet_machine() -> Result> } fn input_domain() -> Eip712Domain { - Eip712Domain { - name: None, - version: None, - chain_id: Some(U256::from(TEST_CHAIN_ID)), - verifying_contract: Some(TEST_DAPP_ADDRESS), - salt: None, - } + sequencer_core::build_input_domain(TEST_CHAIN_ID, TEST_DAPP_ADDRESS) } fn signing_key(byte: u8) -> SigningKey { diff --git a/sequencer-core/src/application/mod.rs b/sequencer-core/src/application/mod.rs index d3eb462..671cfc0 100644 --- a/sequencer-core/src/application/mod.rs +++ b/sequencer-core/src/application/mod.rs @@ -102,6 +102,15 @@ pub trait Application: Send { user_op: &UserOp, current_fee: u16, ) -> Result { + // Protocol invariant: max_fee must cover the current frame fee. + // Enforced here so every Application impl inherits it. + if user_op.max_fee < current_fee { + return Ok(ExecutionOutcome::Invalid(InvalidReason::InvalidMaxFee { + max_fee: user_op.max_fee, + base_fee: current_fee, + })); + } + if let Err(reason) = self.validate_user_op(sender, user_op, current_fee) { return Ok(ExecutionOutcome::Invalid(reason)); } diff --git a/sequencer-core/src/batch.rs b/sequencer-core/src/batch.rs index ff20fdd..2f3fa4b 100644 --- a/sequencer-core/src/batch.rs +++ b/sequencer-core/src/batch.rs @@ -4,10 +4,6 @@ use crate::user_op::UserOp; use ssz_derive::{Decode, Encode}; -/// Tag byte for InputBox payloads that are L1 app direct inputs (e.g. deposits). -/// L1/app must post such inputs as `0x00 || body`. Only these are stored (body only) and executed. -pub const INPUT_TAG_DIRECT_INPUT: u8 = 0x00; - // --------------------------------------------------------------------------- // Gas-economics-derived batch sizing // @@ -76,23 +72,140 @@ impl WireUserOp { } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BatchForSubmission { - pub batch_index: u64, - pub created_at_ms: u64, - pub batch: Batch, -} +#[cfg(test)] +mod tests { + use super::*; + use ssz::{Decode, Encode}; + + fn sample_user_op(nonce: u32) -> WireUserOp { + WireUserOp { + nonce, + max_fee: 100, + data: vec![0xaa, 0xbb, 0xcc, 0xdd], + signature: vec![0xee; WireUserOp::SIGNATURE_BYTES], + } + } + + fn sample_frame(safe_block: u64, user_op_count: u32) -> Frame { + Frame { + user_ops: (0..user_op_count).map(sample_user_op).collect(), + safe_block, + fee_price: 42, + } + } + + fn sample_batch(nonce: u64, frame_count: u64) -> Batch { + Batch { + nonce, + frames: (0..frame_count).map(|i| sample_frame(100 + i, 2)).collect(), + } + } + + // ── SSZ round-trip determinism ────────────────────────────────── -impl BatchForSubmission { - /// Encode the batch for the scheduler as a single SSZ payload. - /// - /// Payload is `ssz(Batch { nonce: batch_index, frames })`. The scheduler decodes this - /// and uses `batch.nonce` for deduplication; classification at the rollup is by msg_sender. - pub fn encode_for_scheduler(&self) -> Vec { + #[test] + fn ssz_roundtrip_empty_batch_is_identity() { let batch = Batch { - nonce: self.batch_index, - frames: self.batch.frames.clone(), + nonce: 0, + frames: vec![], }; - ssz::Encode::as_ssz_bytes(&batch) + let encoded = batch.as_ssz_bytes(); + let decoded = Batch::from_ssz_bytes(&encoded).expect("decode empty batch"); + assert_eq!(decoded, batch); + assert_eq!(decoded.as_ssz_bytes(), encoded); + } + + #[test] + fn ssz_roundtrip_populated_batch_is_identity() { + let batch = sample_batch(42, 3); + let encoded = batch.as_ssz_bytes(); + let decoded = Batch::from_ssz_bytes(&encoded).expect("decode populated batch"); + assert_eq!(decoded, batch); + assert_eq!(decoded.as_ssz_bytes(), encoded); + } + + #[test] + fn ssz_roundtrip_frame_with_empty_user_ops_is_identity() { + // Closed-empty frames (direct-input-only) are a real on-wire shape. + let frame = Frame { + user_ops: vec![], + safe_block: 7, + fee_price: 0, + }; + let encoded = frame.as_ssz_bytes(); + let decoded = Frame::from_ssz_bytes(&encoded).expect("decode"); + assert_eq!(decoded, frame); + } + + #[test] + fn ssz_roundtrip_wire_user_op_is_identity() { + let uop = sample_user_op(99); + let encoded = uop.as_ssz_bytes(); + let decoded = WireUserOp::from_ssz_bytes(&encoded).expect("decode wire user op"); + assert_eq!(decoded, uop); + } + + #[test] + fn ssz_encoding_is_deterministic_across_calls() { + // Determinism under the same input is a consensus requirement; encoding + // the same batch twice must produce byte-identical output. + let batch = sample_batch(7, 2); + assert_eq!(batch.as_ssz_bytes(), batch.as_ssz_bytes()); + } + + // ── Decode robustness (no panics on adversarial bytes) ────────── + + #[test] + fn ssz_decode_empty_payload_returns_error() { + assert!(Batch::from_ssz_bytes(&[]).is_err()); + } + + #[test] + fn ssz_decode_below_fixed_header_returns_error() { + // Batch's fixed portion is 8 (nonce) + 4 (frames offset) = 12 bytes. + for len in 0..12 { + let buf = vec![0u8; len]; + assert!( + Batch::from_ssz_bytes(&buf).is_err(), + "decoding {len} bytes below fixed header must fail", + ); + } + } + + #[test] + fn ssz_decode_truncated_valid_batch_returns_error() { + let batch = sample_batch(1, 2); + let full = batch.as_ssz_bytes(); + // Truncating anywhere before the full length must not round-trip. + for cut in 0..full.len() { + let truncated = &full[..cut]; + match Batch::from_ssz_bytes(truncated) { + Err(_) => {} + Ok(decoded) => assert_ne!( + decoded, batch, + "truncation at {cut} silently decoded to the original batch", + ), + } + } + } + + #[test] + fn ssz_decode_invalid_offset_returns_error() { + // Well-formed nonce (8 zero bytes), frames offset points far past the + // buffer end. SSZ must reject rather than read out of bounds. + let mut buf = vec![0u8; 12]; + buf[8..12].copy_from_slice(&0xffff_ffff_u32.to_le_bytes()); + assert!(Batch::from_ssz_bytes(&buf).is_err()); + } + + #[test] + fn ssz_decode_garbage_bytes_never_panics() { + // Adversarial fixed patterns. Decoding may Err or Ok; the invariant we + // care about is "no panic" — the test passing proves it. + for pattern in [0x00, 0x01, 0x42, 0x7f, 0x80, 0xff] { + for len in [1, 12, 64, 256, 1024] { + let _ = Batch::from_ssz_bytes(&vec![pattern; len]); + } + } } } diff --git a/sequencer-core/src/lib.rs b/sequencer-core/src/lib.rs index fe33e65..3f645ee 100644 --- a/sequencer-core/src/lib.rs +++ b/sequencer-core/src/lib.rs @@ -1,10 +1,40 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +use alloy_primitives::{Address, U256}; +use alloy_sol_types::Eip712Domain; + pub mod api; pub mod application; pub mod batch; pub mod broadcast; pub mod fee; pub mod l2_tx; +pub mod protocol; pub mod user_op; + +/// Maximum number of L1 blocks a batch can wait before the scheduler considers it stale. +/// Shared between the scheduler (canonical-app) and the sequencer (batch submitter, startup detection). +pub const MAX_WAIT_BLOCKS: u64 = 1200; + +/// EIP-712 domain name shared between sequencer and scheduler. +pub const DOMAIN_NAME: &str = "CartesiAppSequencer"; + +/// EIP-712 domain version shared between sequencer and scheduler. +pub const DOMAIN_VERSION: &str = "1"; + +/// Build the canonical EIP-712 domain for user-op signing and verification. +/// +/// Both the sequencer (signature verification at ingress) and the scheduler +/// (signature recovery during batch execution) MUST use this constructor. +/// A mismatch in any field changes the domain separator and causes every +/// signature to recover a different address. +pub fn build_input_domain(chain_id: u64, verifying_contract: Address) -> Eip712Domain { + Eip712Domain { + name: Some(DOMAIN_NAME.into()), + version: Some(DOMAIN_VERSION.into()), + chain_id: Some(U256::from(chain_id)), + verifying_contract: Some(verifying_contract), + salt: None, + } +} diff --git a/sequencer-core/src/protocol.rs b/sequencer-core/src/protocol.rs new file mode 100644 index 0000000..f4ac2d7 --- /dev/null +++ b/sequencer-core/src/protocol.rs @@ -0,0 +1,600 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Protocol timing parameters the sequencer mirrors from the scheduler, plus +//! the sequencer-side tuning knobs that govern preemptive self-protection. +//! +//! [`ProtocolTiming`] is the single source of truth for time-based protocol +//! rules: +//! +//! - **Scheduler-acceptance** timing (`is_scheduler_stale`, `scheduler_accepts`). +//! `max_wait_blocks` matches the on-chain scheduler exactly — mis-aligning +//! it would cause the sequencer's cached "gold frontier" to diverge from +//! the scheduler's actual accepted set. +//! - **Preemptive-recovery** tuning (`danger_threshold`, +//! `l1_read_stale_after_blocks`, `seconds_per_block`). These do not exist +//! on the scheduler side; they control when the sequencer proactively +//! stops to avoid issuing soft confirmations against stale or unknowable +//! L1 state. +//! +//! The batch-submitter address is **not** part of `ProtocolTiming`. It's an +//! identity, not a timing parameter; the validation in +//! [`ProtocolTiming::try_new`] only relates timing fields to each other. +//! Predicates that need the address (`scheduler_accepts`) take it as a +//! parameter — see those methods. + +use crate::batch::Batch; +use alloy_primitives::Address; +use thiserror::Error; + +/// Error surfaced by [`ProtocolTiming::try_new`] when timing fields would +/// produce an unusable danger threshold. +/// +/// Returning a typed error rather than panicking lets the runtime convert this +/// into a `Result` at config-parse time and surface it through the structured +/// `RunError` taxonomy, instead of crashing later inside +/// [`ProtocolTiming::danger_threshold`] (or worse, inside a logging macro). +#[derive(Debug, Error, PartialEq, Eq)] +pub enum ProtocolTimingError { + /// `preemptive_margin_blocks == 0` — zero-width preemptive zone defeats + /// the entire margin design (recovery only ever fires at the absolute max, + /// with no operator runway). Always an operator misconfig. + #[error("preemptive_margin_blocks must be greater than zero")] + MarginZero, + /// `preemptive_margin_blocks >= max_wait_blocks` — the danger threshold + /// would be 0, making preemptive recovery indistinguishable from hard + /// staleness. The margin is supposed to be operator runway *before* + /// hitting `MAX_WAIT_BLOCKS`, so this is always an operator misconfig. + #[error( + "preemptive_margin_blocks ({margin}) must be strictly less than \ + max_wait_blocks ({max_wait})" + )] + MarginNotLessThanMaxWait { margin: u64, max_wait: u64 }, + /// `l1_read_stale_after_blocks == 0` would make the L1 view unusable + /// immediately, so the sequencer could never start. + #[error("l1_read_stale_after_blocks must be greater than zero")] + ReadStaleAfterZero, + /// The read-staleness threshold must fire strictly before the write danger + /// threshold. Equality would have both refusal arms (L1ViewStale and + /// EstimatedBatchInDanger) trip at the same point, defeating the + /// "stale fires first" design property. + #[error( + "l1_read_stale_after_blocks ({read_stale_after}) must be strictly \ + less than danger_threshold ({danger_threshold})" + )] + ReadStaleAfterPastDanger { + read_stale_after: u64, + danger_threshold: u64, + }, +} + +/// Time-based protocol parameters: scheduler-mirroring `max_wait_blocks` +/// plus sequencer-side preemptive-recovery tuning. No identity (the +/// batch-submitter address is passed separately to predicates that need it). +/// +/// Construct via [`ProtocolTiming::try_new`] in production code so the +/// margin/stale relationships are checked once up front. The fields stay +/// public to keep test fixtures concise — direct struct-literal construction +/// is fine where the inputs are controlled. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ProtocolTiming { + /// `MAX_WAIT_BLOCKS` — after this many L1 blocks, the scheduler skips a + /// submitted batch as stale. + pub max_wait_blocks: u64, + /// How many blocks before `max_wait_blocks` the sequencer triggers + /// preemptive recovery. Sequencer-local; must be strictly less than + /// `max_wait_blocks` (enforced by [`ProtocolTiming::try_new`]). + pub preemptive_margin_blocks: u64, + /// How long the L1 safe block itself may be stale, measured in assumed L1 + /// blocks. Sequencer-local; startup refuses once the safe block timestamp + /// is older than this threshold. + pub l1_read_stale_after_blocks: u64, + /// Wall-clock estimate of L1 block time, used as a fallback when the L1 + /// safe head appears frozen. Sequencer-local. + pub seconds_per_block: u64, +} + +impl ProtocolTiming { + /// Validated constructor. Rejects timing configurations that would + /// produce an unusable danger threshold or a degenerate margin. + /// + /// Production callers should use this; tests can still construct + /// `ProtocolTiming` directly via struct-literal syntax with controlled + /// inputs. + pub fn try_new( + max_wait_blocks: u64, + preemptive_margin_blocks: u64, + l1_read_stale_after_blocks: u64, + seconds_per_block: u64, + ) -> Result { + if preemptive_margin_blocks == 0 { + return Err(ProtocolTimingError::MarginZero); + } + if preemptive_margin_blocks >= max_wait_blocks { + return Err(ProtocolTimingError::MarginNotLessThanMaxWait { + margin: preemptive_margin_blocks, + max_wait: max_wait_blocks, + }); + } + if l1_read_stale_after_blocks == 0 { + return Err(ProtocolTimingError::ReadStaleAfterZero); + } + let danger_threshold = max_wait_blocks - preemptive_margin_blocks; + if l1_read_stale_after_blocks >= danger_threshold { + return Err(ProtocolTimingError::ReadStaleAfterPastDanger { + read_stale_after: l1_read_stale_after_blocks, + danger_threshold, + }); + } + Ok(Self { + max_wait_blocks, + preemptive_margin_blocks, + l1_read_stale_after_blocks, + seconds_per_block, + }) + } + + /// The block-age threshold at which preemptive recovery triggers. + /// + /// `saturating_sub` keeps this infallible even on a directly-constructed + /// `ProtocolTiming` with an invalid margin (returns 0 in that case). + /// Production code goes through [`ProtocolTiming::try_new`], which rejects + /// that configuration up front. + pub fn danger_threshold(&self) -> u64 { + self.max_wait_blocks + .saturating_sub(self.preemptive_margin_blocks) + } + + /// Wall-clock age, in seconds, after which the L1 safe block is too old + /// for the sequencer to trust its L1 view. + pub fn l1_read_stale_after_secs(&self) -> u64 { + self.l1_read_stale_after_blocks + .saturating_mul(self.seconds_per_block.max(1)) + } + + /// Whether the safe block timestamp is too old to support recovery or + /// continued soft confirmations. `None` means the view is unknown and is + /// treated as unusable. + pub fn l1_view_is_stale(&self, safe_block_timestamp_secs: Option, now_ms: u64) -> bool { + let Some(timestamp_secs) = safe_block_timestamp_secs else { + return true; + }; + let now_secs = now_ms / 1000; + now_secs.saturating_sub(timestamp_secs) >= self.l1_read_stale_after_secs() + } + + /// Wall-clock-adjusted danger threshold, used when the L1 safe head may be + /// stale. + /// + /// Translates "wall-clock time elapsed since the last L1 safe-head + /// observation" into "blocks the safe head is presumed to be behind", and + /// shaves that off the strict [`Self::danger_threshold`]. Returns `None` + /// when the wall-clock arm should be skipped: + /// + /// - `last_safe_progress_ms` is `None` — no baseline to extrapolate from. + /// - Less than one block-time has elapsed — adjustment would be 0, so the + /// strict check covers this case directly. + /// + /// Returns `Some(adjusted)` where + /// `adjusted = danger_threshold − (elapsed_secs / seconds_per_block)`, + /// saturating at 0. + pub fn wall_clock_adjusted_danger_threshold( + &self, + last_safe_progress_ms: Option, + now_ms: u64, + ) -> Option { + let last = last_safe_progress_ms?; + let elapsed_secs = now_ms.saturating_sub(last) / 1000; + let missed = elapsed_secs / self.seconds_per_block.max(1); + if missed == 0 { + return None; + } + Some(self.danger_threshold().saturating_sub(missed)) + } + + /// Scheduler's staleness predicate: a batch is stale when + /// `inclusion_block - first_frame_safe_block >= max_wait_blocks`. Used by + /// the scheduler to skip stale submissions, and by the sequencer's + /// frontier simulator to match that behavior. + pub fn is_scheduler_stale(&self, inclusion_block: u64, first_frame_safe_block: u64) -> bool { + age_exceeds( + inclusion_block, + first_frame_safe_block, + self.max_wait_blocks, + ) + } + + /// Off-chain simulation of the scheduler's batch-acceptance predicate. + /// + /// Returns `Some(AcceptedBatch)` iff the scheduler would accept the input + /// at the given `expected_nonce`. The caller threads `expected_nonce` + /// across a stream of inputs, advancing by one on each `Some`. + /// + /// `batch_submitter` is the address the scheduler accepts batches from — + /// kept as a parameter (rather than a struct field) because it's + /// orthogonal to timing and the validation in [`Self::try_new`] doesn't + /// touch it. + /// + /// Rejection paths (wrong sender, SSZ decode failure, stale by inclusion, + /// nonce mismatch) return `None` without advancing — matching what the + /// scheduler does on-chain. + pub fn scheduler_accepts( + &self, + batch_submitter: Address, + input: SafeInputView<'_>, + expected_nonce: u64, + ) -> Option { + if input.sender != batch_submitter { + return None; + } + let batch = ::from_ssz_bytes(input.payload).ok()?; + let first_frame_safe_block = batch.frames.first().map(|f| f.safe_block).unwrap_or(0); + if !batch.frames.is_empty() + && self.is_scheduler_stale(input.inclusion_block, first_frame_safe_block) + { + return None; + } + if batch.nonce != expected_nonce { + return None; + } + Some(AcceptedBatch { + safe_input_index: input.safe_input_index, + nonce: batch.nonce, + first_frame_safe_block, + inclusion_block: input.inclusion_block, + }) + } +} + +/// Generic "age exceeds threshold" predicate shared between scheduler-staleness +/// and the preemptive danger-zone check. Saturating subtraction keeps the +/// arithmetic total over pathological inputs (safe head below a batch's first +/// frame). +pub fn age_exceeds(reference_block: u64, first_frame_safe_block: u64, threshold: u64) -> bool { + reference_block.saturating_sub(first_frame_safe_block) >= threshold +} + +/// Borrowed view of one safe-input row, in the shape scheduler_accepts needs. +/// Using a borrowed payload avoids copying during iteration. +#[derive(Debug, Clone, Copy)] +pub struct SafeInputView<'a> { + pub safe_input_index: u64, + pub sender: Address, + pub payload: &'a [u8], + pub inclusion_block: u64, +} + +/// One batch submission the scheduler would accept as part of its gold frontier. +#[derive(Debug, Clone, Copy)] +pub struct AcceptedBatch { + pub safe_input_index: u64, + pub nonce: u64, + pub first_frame_safe_block: u64, + pub inclusion_block: u64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::batch::{Batch, Frame}; + + const SUBMITTER: Address = Address::repeat_byte(0xAA); + const OTHER: Address = Address::repeat_byte(0xBB); + const MAX_WAIT: u64 = 1200; + + fn timing() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: MAX_WAIT, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } + } + + fn encode(batch: &Batch) -> Vec { + ssz::Encode::as_ssz_bytes(batch) + } + + fn single_frame_batch(nonce: u64, safe_block: u64) -> Batch { + Batch { + nonce, + frames: vec![Frame { + user_ops: vec![], + safe_block, + fee_price: 0, + }], + } + } + + #[test] + fn danger_threshold_is_max_wait_minus_margin() { + assert_eq!(timing().danger_threshold(), MAX_WAIT - 75); + } + + #[test] + fn l1_read_stale_after_secs_uses_configured_block_time() { + assert_eq!(timing().l1_read_stale_after_secs(), 900 * 12); + } + + #[test] + fn l1_view_is_stale_without_timestamp() { + assert!(timing().l1_view_is_stale(None, 1_000_000)); + } + + #[test] + fn l1_view_is_stale_at_threshold() { + let cfg = timing(); + let timestamp_secs = 1_000; + let before = (timestamp_secs + cfg.l1_read_stale_after_secs() - 1) * 1000; + let at = (timestamp_secs + cfg.l1_read_stale_after_secs()) * 1000; + assert!(!cfg.l1_view_is_stale(Some(timestamp_secs), before)); + assert!(cfg.l1_view_is_stale(Some(timestamp_secs), at)); + } + + #[test] + fn wall_clock_adjusted_threshold_returns_none_without_baseline() { + assert_eq!( + timing().wall_clock_adjusted_danger_threshold(None, 1_000_000), + None, + ); + } + + #[test] + fn wall_clock_adjusted_threshold_returns_none_below_one_block_time() { + // 11 seconds elapsed, seconds_per_block = 12 → missed = 0. + let last = 1_000_000; + let now = last + 11_000; + assert_eq!( + timing().wall_clock_adjusted_danger_threshold(Some(last), now), + None, + ); + } + + #[test] + fn wall_clock_adjusted_threshold_shaves_one_block_per_block_time() { + // 25 blocks of elapsed time (300s) → adjusted = danger_threshold - 25. + let last = 1_000_000; + let now = last + 300_000; + let cfg = timing(); + assert_eq!( + cfg.wall_clock_adjusted_danger_threshold(Some(last), now), + Some(cfg.danger_threshold() - 25), + ); + } + + #[test] + fn wall_clock_adjusted_threshold_saturates_at_zero() { + // Wildly long outage — adjustment shaves more blocks than the threshold. + let last = 0; + let now = u64::MAX / 2; + assert_eq!( + timing().wall_clock_adjusted_danger_threshold(Some(last), now), + Some(0), + ); + } + + #[test] + fn danger_threshold_saturates_to_zero_on_invalid_margin() { + // try_new rejects this configuration; if a test ever constructs it + // directly via struct-literal syntax, danger_threshold returns 0 + // rather than panicking. (Cleaner than a hard panic during a logging + // macro on production startup.) + let cfg = ProtocolTiming { + preemptive_margin_blocks: MAX_WAIT, + ..timing() + }; + assert_eq!(cfg.danger_threshold(), 0); + + let cfg = ProtocolTiming { + preemptive_margin_blocks: MAX_WAIT + 1, + ..timing() + }; + assert_eq!(cfg.danger_threshold(), 0); + } + + #[test] + fn try_new_rejects_margin_equal_to_max_wait() { + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, MAX_WAIT, 1, 12), + Err(ProtocolTimingError::MarginNotLessThanMaxWait { + margin: MAX_WAIT, + max_wait: MAX_WAIT, + }), + ); + } + + #[test] + fn try_new_rejects_margin_greater_than_max_wait() { + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, MAX_WAIT + 1, 1, 12), + Err(ProtocolTimingError::MarginNotLessThanMaxWait { + margin: MAX_WAIT + 1, + max_wait: MAX_WAIT, + }), + ); + } + + #[test] + fn try_new_accepts_max_margin_admitting_valid_stale() { + // margin = MAX_WAIT - 2 is the largest margin that still admits a + // valid stale value (stale must satisfy 0 < stale < danger, and + // danger = MAX_WAIT - margin, so danger >= 2 is required for stale = 1 + // to be valid). + let cfg = ProtocolTiming::try_new(MAX_WAIT, MAX_WAIT - 2, 1, 12) + .expect("margin admitting stale = 1 must be accepted"); + assert_eq!(cfg.danger_threshold(), 2); + } + + #[test] + fn try_new_rejects_zero_margin() { + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, 0, 1, 12), + Err(ProtocolTimingError::MarginZero), + ); + } + + #[test] + fn try_new_rejects_zero_l1_read_stale_after() { + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, 75, 0, 12), + Err(ProtocolTimingError::ReadStaleAfterZero), + ); + } + + #[test] + fn try_new_rejects_l1_read_stale_after_past_danger() { + let danger_threshold = MAX_WAIT - 75; + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, 75, danger_threshold + 1, 12), + Err(ProtocolTimingError::ReadStaleAfterPastDanger { + read_stale_after: danger_threshold + 1, + danger_threshold, + }), + ); + } + + #[test] + fn try_new_rejects_l1_read_stale_after_equal_to_danger() { + // Strict less-than: equality means L1ViewStale and + // EstimatedBatchInDanger fire at the same point, defeating the + // "stale fires first" design property. + let danger_threshold = MAX_WAIT - 75; + assert_eq!( + ProtocolTiming::try_new(MAX_WAIT, 75, danger_threshold, 12), + Err(ProtocolTimingError::ReadStaleAfterPastDanger { + read_stale_after: danger_threshold, + danger_threshold, + }), + ); + } + + #[test] + fn age_exceeds_saturates_on_underflow() { + assert!(!age_exceeds(5, 10, 1)); + assert!(age_exceeds(1200, 0, 1200)); + assert!(!age_exceeds(1199, 0, 1200)); + } + + // ── ProtocolTiming::is_scheduler_stale direct boundary tests ────────── + // + // Indirectly covered by `scheduler_accepts_boundary_just_below_stale`, but + // the staleness predicate is load-bearing on its own (the scheduler skips + // submissions that trip it) and deserves direct tests that don't go through + // SSZ decoding. + + #[test] + fn is_scheduler_stale_reports_false_below_threshold() { + // age = inclusion - first = MAX_WAIT - 1, strictly below. + assert!(!timing().is_scheduler_stale(MAX_WAIT, 1)); + // age = 0 (safe head right at the first frame). + assert!(!timing().is_scheduler_stale(100, 100)); + } + + #[test] + fn is_scheduler_stale_reports_true_at_and_past_threshold() { + // age = MAX_WAIT exactly — `>=` comparison trips. + assert!(timing().is_scheduler_stale(MAX_WAIT, 0)); + // age = MAX_WAIT + 1, clearly past. + assert!(timing().is_scheduler_stale(MAX_WAIT + 1, 0)); + } + + #[test] + fn is_scheduler_stale_saturates_when_first_frame_is_ahead() { + // Degenerate input: safe head is behind the first frame's safe_block. + // Saturating subtraction yields 0, strictly below threshold — never stale. + assert!(!timing().is_scheduler_stale(50, 100)); + } + + #[test] + fn scheduler_accepts_fresh_batch_with_matching_nonce() { + let payload = encode(&single_frame_batch(3, 100)); + let input = SafeInputView { + safe_input_index: 7, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: 500, + }; + let accepted = timing() + .scheduler_accepts(SUBMITTER, input, 3) + .expect("matching nonce + fresh inclusion should be accepted"); + assert_eq!(accepted.safe_input_index, 7); + assert_eq!(accepted.nonce, 3); + assert_eq!(accepted.first_frame_safe_block, 100); + assert_eq!(accepted.inclusion_block, 500); + } + + #[test] + fn scheduler_rejects_wrong_sender() { + let payload = encode(&single_frame_batch(0, 0)); + let input = SafeInputView { + safe_input_index: 0, + sender: OTHER, + payload: payload.as_slice(), + inclusion_block: 0, + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 0).is_none()); + } + + #[test] + fn scheduler_rejects_stale_by_inclusion() { + let payload = encode(&single_frame_batch(0, 0)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT, + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 0).is_none()); + } + + #[test] + fn scheduler_accepts_boundary_just_below_stale() { + let payload = encode(&single_frame_batch(0, 1)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT, + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 0).is_some()); + } + + #[test] + fn scheduler_rejects_nonce_mismatch() { + let payload = encode(&single_frame_batch(2, 100)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: 200, + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 3).is_none()); + assert!(timing().scheduler_accepts(SUBMITTER, input, 1).is_none()); + } + + #[test] + fn scheduler_rejects_garbage_payload() { + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: &[0xFF, 0xEE, 0xDD], + inclusion_block: 0, + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 0).is_none()); + } + + #[test] + fn scheduler_accepts_empty_frames_batch_regardless_of_age() { + let payload = encode(&Batch { + nonce: 0, + frames: vec![], + }); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT.saturating_mul(10), + }; + assert!(timing().scheduler_accepts(SUBMITTER, input, 0).is_some()); + } +} diff --git a/sequencer/Cargo.toml b/sequencer/Cargo.toml index 2d8d0d6..a2343b2 100644 --- a/sequencer/Cargo.toml +++ b/sequencer/Cargo.toml @@ -40,3 +40,7 @@ tokio-tungstenite = "0.28" k256 = "0.13.4" tempfile = "3" sequencer-rust-client = { path = "../sdk/rust-client" } +# Used for `TcpProxy` in inline tests that need to simulate provider disconnect +# (e.g., flusher survives extended outage). The sequencer crate doesn't depend +# on `rollups-harness` in production; only the test profile pulls it in. +rollups-harness = { path = "../tests/harness" } diff --git a/sequencer/src/api/error.rs b/sequencer/src/api/error.rs deleted file mode 100644 index 9a75d76..0000000 --- a/sequencer/src/api/error.rs +++ /dev/null @@ -1,115 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use axum::Json; -use axum::http::StatusCode; -use axum::response::{IntoResponse, Response}; -use serde::Serialize; -use thiserror::Error; - -use crate::inclusion_lane::SequencerError; -use sequencer_core::api::TxRequestError; - -#[derive(Debug, Error, Clone)] -pub enum ApiError { - #[error("{0}")] - BadRequest(String), - #[error("{0}")] - PayloadTooLarge(String), - #[error("{0}")] - InvalidSignature(String), - #[error("{0}")] - ExecutionRejected(String), - #[error("{0}")] - Unavailable(String), - #[error("{0}")] - InternalError(String), - #[error("{0}")] - Overloaded(String), -} - -#[derive(Debug, Serialize)] -struct ErrorResponse { - ok: bool, - code: &'static str, - message: String, -} - -impl ApiError { - pub fn bad_request(message: impl Into) -> Self { - Self::BadRequest(message.into()) - } - - pub fn payload_too_large(message: impl Into) -> Self { - Self::PayloadTooLarge(message.into()) - } - - pub fn invalid_signature(message: impl Into) -> Self { - Self::InvalidSignature(message.into()) - } - - pub fn internal_error(message: impl Into) -> Self { - Self::InternalError(message.into()) - } - - pub fn unavailable(message: impl Into) -> Self { - Self::Unavailable(message.into()) - } - - pub fn overloaded(message: impl Into) -> Self { - Self::Overloaded(message.into()) - } - - pub fn status(&self) -> StatusCode { - match self { - Self::BadRequest(_) | Self::InvalidSignature(_) => StatusCode::BAD_REQUEST, - Self::PayloadTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, - Self::ExecutionRejected(_) => StatusCode::UNPROCESSABLE_ENTITY, - Self::Unavailable(_) => StatusCode::SERVICE_UNAVAILABLE, - Self::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR, - Self::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS, - } - } - - pub fn code(&self) -> &'static str { - match self { - Self::BadRequest(_) => "BAD_REQUEST", - Self::PayloadTooLarge(_) => "PAYLOAD_TOO_LARGE", - Self::InvalidSignature(_) => "INVALID_SIGNATURE", - Self::ExecutionRejected(_) => "EXECUTION_REJECTED", - Self::Unavailable(_) => "UNAVAILABLE", - Self::InternalError(_) => "INTERNAL_ERROR", - Self::Overloaded(_) => "OVERLOADED", - } - } -} - -impl From for ApiError { - fn from(value: SequencerError) -> Self { - match value { - SequencerError::Invalid(message) => Self::ExecutionRejected(message), - SequencerError::Unavailable(message) => Self::Unavailable(message), - SequencerError::Internal(message) => Self::InternalError(message), - } - } -} - -impl From for ApiError { - fn from(value: TxRequestError) -> Self { - match value { - TxRequestError::BadRequest(message) => Self::BadRequest(message), - TxRequestError::InvalidSignature(message) => Self::InvalidSignature(message), - } - } -} - -impl IntoResponse for ApiError { - fn into_response(self) -> Response { - let body = ErrorResponse { - ok: false, - code: self.code(), - message: self.to_string(), - }; - (self.status(), Json(body)).into_response() - } -} diff --git a/sequencer/src/api/mod.rs b/sequencer/src/api/mod.rs deleted file mode 100644 index cd7cf1e..0000000 --- a/sequencer/src/api/mod.rs +++ /dev/null @@ -1,118 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod error; -mod state; -mod tx; -mod ws; - -use std::io; -use std::sync::Arc; - -use alloy_sol_types::Eip712Domain; -use axum::Router; -use axum::extract::DefaultBodyLimit; -use axum::http::StatusCode; -use axum::routing::{get, post}; -use tokio::sync::mpsc; -use tower_http::trace::TraceLayer; - -pub use error::ApiError; -use state::ApiState; - -use crate::inclusion_lane::PendingUserOp; -use crate::l2_tx_feed::L2TxFeed; -use crate::shutdown::ShutdownSignal; -use sequencer_core::api::TxRequest; - -const DEFAULT_WS_MAX_SUBSCRIBERS: usize = 64; -const DEFAULT_WS_MAX_CATCHUP_EVENTS: u64 = 50_000; -const DEFAULT_MAX_BODY_BYTES: usize = TxRequest::MAX_JSON_BYTES_RECOMMENDED; -pub const WS_CATCHUP_WINDOW_EXCEEDED_REASON: &str = "catch-up window exceeded"; - -pub type ApiServerTask = tokio::task::JoinHandle>; - -#[derive(Debug, Clone, Copy)] -pub struct ApiConfig { - pub max_body_bytes: usize, - pub ws_max_subscribers: usize, - pub ws_max_catchup_events: u64, -} - -impl Default for ApiConfig { - fn default() -> Self { - Self { - max_body_bytes: DEFAULT_MAX_BODY_BYTES, - ws_max_subscribers: DEFAULT_WS_MAX_SUBSCRIBERS, - ws_max_catchup_events: DEFAULT_WS_MAX_CATCHUP_EVENTS, - } - } -} - -pub async fn start( - http_addr: impl tokio::net::ToSocketAddrs, - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, - shutdown: ShutdownSignal, - tx_feed: L2TxFeed, - config: ApiConfig, -) -> io::Result { - let listener = tokio::net::TcpListener::bind(http_addr).await?; - Ok(start_on_listener( - listener, - tx_sender, - domain, - max_user_op_data_bytes, - shutdown, - tx_feed, - config, - )) -} - -pub fn start_on_listener( - listener: tokio::net::TcpListener, - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, - shutdown: ShutdownSignal, - tx_feed: L2TxFeed, - config: ApiConfig, -) -> ApiServerTask { - let state = Arc::new(ApiState::new( - tx_sender, - domain, - max_user_op_data_bytes, - shutdown.clone(), - tx_feed, - config, - )); - let app = router(state, config.max_body_bytes); - - tokio::spawn(async move { - axum::serve(listener, app) - .with_graceful_shutdown(async move { - shutdown.wait_for_shutdown().await; - }) - .await - }) -} - -fn router(state: Arc, max_body_bytes: usize) -> Router { - Router::new() - .route("/tx", post(tx::submit_tx)) - .route("/ws/subscribe", get(ws::subscribe_l2_txs)) - .with_state(state) - // Enforces a raw request-body cap before JSON deserialization, including whitespace. - .layer(DefaultBodyLimit::max(max_body_bytes)) - .layer(TraceLayer::new_for_http()) -} - -// Keep non-413 JSON extractor failures normalized to 400 for a stable API contract. -fn map_json_rejection(err: axum::extract::rejection::JsonRejection) -> ApiError { - if err.status() == StatusCode::PAYLOAD_TOO_LARGE { - ApiError::payload_too_large(format!("request body too large: {err}")) - } else { - ApiError::bad_request(format!("invalid JSON: {err}")) - } -} diff --git a/sequencer/src/api/tx.rs b/sequencer/src/api/tx.rs deleted file mode 100644 index dad6617..0000000 --- a/sequencer/src/api/tx.rs +++ /dev/null @@ -1,168 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::sync::Arc; -use std::time::SystemTime; - -use axum::extract::{Json, State}; -use tokio::sync::mpsc::error::TrySendError; -use tokio::sync::oneshot; -use tracing::debug; - -use super::{ApiError, ApiState}; -use crate::inclusion_lane::PendingUserOp; -use sequencer_core::api::{TxRequest, TxResponse}; -use sequencer_core::user_op::SignedUserOp; - -pub(super) async fn submit_tx( - State(state): State>, - req: Result, axum::extract::rejection::JsonRejection>, -) -> Result, ApiError> { - let Json(req) = req.map_err(super::map_json_rejection)?; - - let signed = req - .into_signed_user_op(&state.domain, state.max_user_op_data_bytes) - .map_err(ApiError::from)?; - let nonce = signed.user_op.nonce; - let sender = signed.sender; - let ack = enqueue_verified_tx(state.as_ref(), signed)?; - - let commit_result = ack - .await - .map_err(|_| ApiError::internal_error("inclusion lane dropped response"))?; - commit_result.map_err(ApiError::from)?; - debug!(sender = %sender, nonce, "tx committed"); - - Ok(Json(TxResponse { - ok: true, - sender: sender.to_string(), - nonce, - })) -} - -fn enqueue_verified_tx( - state: &ApiState, - signed: SignedUserOp, -) -> Result>, ApiError> { - state.reject_if_shutting_down()?; - - let (respond_to, recv) = oneshot::channel(); - let pending = PendingUserOp { - signed, - respond_to, - received_at: SystemTime::now(), - }; - - match state.tx_sender.try_send(pending) { - Ok(()) => Ok(recv), - Err(TrySendError::Full(_)) => Err(ApiError::overloaded("queue full")), - Err(TrySendError::Closed(_)) => Err(ApiError::internal_error("inclusion lane unavailable")), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use alloy_primitives::{Address, Signature}; - use alloy_sol_types::Eip712Domain; - use alloy_sol_types::SolStruct; - use axum::http::StatusCode; - use k256::ecdsa::SigningKey; - use k256::ecdsa::signature::hazmat::PrehashSigner; - use std::sync::Arc; - use tempfile::TempDir; - use tokio::sync::mpsc; - - use crate::storage::Storage; - use sequencer_core::user_op::UserOp; - - #[tokio::test(flavor = "current_thread")] - async fn submit_tx_rejects_when_shutdown_has_started() { - let db = TempDir::new().expect("create temp dir"); - let db_path = db.path().join("sequencer.db"); - let _storage = Storage::open(&db_path.to_string_lossy(), "NORMAL").expect("create db"); - let shutdown = crate::shutdown::ShutdownSignal::default(); - let tx_feed = crate::l2_tx_feed::L2TxFeed::new( - db_path.to_string_lossy().into_owned(), - shutdown.clone(), - crate::l2_tx_feed::L2TxFeedConfig { - idle_poll_interval: std::time::Duration::from_millis(2), - page_size: 64, - batch_submitter_address: None, - }, - ); - - shutdown.request_shutdown(); - - let (tx_sender, _rx) = mpsc::channel::(1); - let state = Arc::new(ApiState::new( - tx_sender, - Eip712Domain { - name: None, - version: None, - chain_id: None, - verifying_contract: None, - salt: None, - }, - 128, - shutdown, - tx_feed.clone(), - crate::api::ApiConfig { - max_body_bytes: 128, - ws_max_subscribers: 1, - ws_max_catchup_events: 1, - }, - )); - - let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); - let sender = address_from_signing_key(&signing_key); - let user_op = UserOp { - nonce: 0, - max_fee: 0, - data: Vec::new().into(), - }; - let request = TxRequest { - message: user_op.clone(), - signature: sign_user_op_hex(&state.domain, &user_op, &signing_key), - sender: sender.to_string(), - }; - - let result = submit_tx(State(state), Ok(Json(request))).await; - - let err = result.expect_err("submit should be rejected during shutdown"); - assert_eq!(err.status(), StatusCode::SERVICE_UNAVAILABLE); - assert_eq!(err.code(), "UNAVAILABLE"); - } - - fn sign_user_op_hex( - domain: &Eip712Domain, - user_op: &UserOp, - signing_key: &SigningKey, - ) -> String { - let hash = user_op.eip712_signing_hash(domain); - let k256_sig = signing_key - .sign_prehash(hash.as_slice()) - .expect("sign user op hash"); - - let sender = address_from_signing_key(signing_key); - let signature = [false, true] - .into_iter() - .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) - .find(|candidate| { - candidate - .recover_address_from_prehash(&hash) - .ok() - .map(|value| value == sender) - .unwrap_or(false) - }) - .expect("recoverable parity for signature"); - - alloy_primitives::hex::encode_prefixed(signature.as_bytes()) - } - - fn address_from_signing_key(signing_key: &SigningKey) -> Address { - let verifying = signing_key.verifying_key().to_encoded_point(false); - Address::from_raw_public_key(&verifying.as_bytes()[1..]) - } -} diff --git a/sequencer/src/batch_submitter/batch_poster.rs b/sequencer/src/batch_submitter/batch_poster.rs deleted file mode 100644 index 86cb27a..0000000 --- a/sequencer/src/batch_submitter/batch_poster.rs +++ /dev/null @@ -1,219 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use alloy::providers::{DynProvider, Provider}; -use async_trait::async_trait; -use cartesi_rollups_contracts::input_box::InputBox; -use sequencer_core::batch::Batch; -use thiserror::Error; - -use crate::partition::{decode_evm_advance_input, get_input_added_events}; - -pub type TxHash = alloy_primitives::B256; - -#[derive(Debug, Clone)] -pub struct BatchPosterConfig { - pub l1_submit_address: alloy_primitives::Address, - pub app_address: alloy_primitives::Address, - pub batch_submitter_address: alloy_primitives::Address, - pub start_block: u64, - pub confirmation_depth: u64, - /// Error codes that trigger `get_logs` retries with a shorter block range. - pub long_block_range_error_codes: Vec, -} - -#[derive(Debug, Error)] -pub enum BatchPosterError { - #[error("provider/transport: {0}")] - Provider(String), -} - -#[async_trait] -pub trait BatchPoster: Send + Sync { - async fn submit_batch(&self, payload: Vec) -> Result; - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError>; -} - -#[derive(Clone)] -pub struct EthereumBatchPoster { - provider: DynProvider, - config: BatchPosterConfig, -} - -impl EthereumBatchPoster { - pub fn new(provider: DynProvider, config: BatchPosterConfig) -> Self { - Self { provider, config } - } -} - -#[async_trait] -impl BatchPoster for EthereumBatchPoster { - async fn submit_batch(&self, payload: Vec) -> Result { - let input_box = InputBox::new(self.config.l1_submit_address, &self.provider); - let pending = input_box - .addInput(self.config.app_address, payload.into()) - .send() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - let tx_hash = *pending.tx_hash(); - - pending - .with_required_confirmations(self.config.confirmation_depth.saturating_add(1)) - .watch() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - - Ok(tx_hash) - } - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError> { - let latest = self - .provider - .get_block_number() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - let end_block = latest.saturating_sub(self.config.confirmation_depth); - let start_block = from_block.max(self.config.start_block); - if start_block > end_block { - return Ok(Vec::new()); - } - - let events = get_input_added_events( - &self.provider, - self.config.app_address, - &self.config.l1_submit_address, - start_block, - end_block, - self.config.long_block_range_error_codes.as_slice(), - ) - .await - .map_err(|errs| { - BatchPosterError::Provider( - errs.into_iter() - .next() - .map(|e| e.to_string()) - .unwrap_or_default(), - ) - })?; - - let mut observed_nonces = Vec::new(); - for (event, _log) in events { - let evm_advance = decode_evm_advance_input(event.input.as_ref()) - .map_err(BatchPosterError::Provider)?; - if evm_advance.msgSender != self.config.batch_submitter_address { - continue; - } - let batch: Batch = ssz::Decode::from_ssz_bytes(evm_advance.payload.as_ref()) - .map_err(|err| BatchPosterError::Provider(format!("{err:?}")))?; - observed_nonces.push(batch.nonce); - } - - Ok(observed_nonces) - } -} - -#[cfg(test)] -pub(crate) mod mock { - use super::{Batch, BatchPoster, BatchPosterError, TxHash}; - use async_trait::async_trait; - use std::sync::Mutex; - - #[derive(Debug)] - pub struct MockBatchPoster { - pub submissions: Mutex>, - pub fail_submit: Mutex, - pub observed_submitted_nonces: Mutex>, - pub observed_submitted_error: Mutex>, - pub last_from_block: Mutex>, - } - - impl MockBatchPoster { - pub fn new() -> Self { - Self { - submissions: Mutex::new(Vec::new()), - fail_submit: Mutex::new(false), - observed_submitted_nonces: Mutex::new(Vec::new()), - observed_submitted_error: Mutex::new(None), - last_from_block: Mutex::new(None), - } - } - - pub fn submissions(&self) -> Vec<(u64, usize)> { - self.submissions.lock().expect("lock").clone() - } - - pub fn set_observed_submitted_nonces(&self, value: Vec) { - *self.observed_submitted_nonces.lock().expect("lock") = value; - } - - pub fn set_observed_submitted_error(&self, value: Option<&str>) { - *self.observed_submitted_error.lock().expect("lock") = value.map(str::to_string); - } - - pub fn last_from_block(&self) -> Option { - *self.last_from_block.lock().expect("lock") - } - } - - #[async_trait] - impl BatchPoster for MockBatchPoster { - async fn submit_batch(&self, payload: Vec) -> Result { - if *self.fail_submit.lock().expect("lock") { - return Err(BatchPosterError::Provider("mock submit fail".into())); - } - let batch_index = ssz::Decode::from_ssz_bytes(payload.as_ref()) - .map(|b: Batch| b.nonce) - .unwrap_or(0); - self.submissions - .lock() - .expect("lock") - .push((batch_index, payload.len())); - Ok(TxHash::ZERO) - } - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError> { - *self.last_from_block.lock().expect("lock") = Some(from_block); - if let Some(err) = self.observed_submitted_error.lock().expect("lock").clone() { - return Err(BatchPosterError::Provider(err)); - } - let configured = self.observed_submitted_nonces.lock().expect("lock").clone(); - if !configured.is_empty() { - return Ok(configured); - } - Ok(self - .submissions - .lock() - .expect("lock") - .iter() - .map(|(idx, _)| *idx) - .collect()) - } - } -} - -#[cfg(test)] -mod tests { - use super::{BatchPoster, mock::MockBatchPoster}; - - #[tokio::test] - async fn mock_poster_tracks_requested_suffix_start_block() { - let poster = MockBatchPoster::new(); - let observed = poster - .observed_submitted_batch_nonces(42) - .await - .expect("observe submitted batches"); - - assert!(observed.is_empty()); - assert_eq!(poster.last_from_block(), Some(42)); - } -} diff --git a/sequencer/src/batch_submitter/mod.rs b/sequencer/src/batch_submitter/mod.rs deleted file mode 100644 index 7b33556..0000000 --- a/sequencer/src/batch_submitter/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Batch submitter: posts closed batches to L1 with at-least-once semantics. -//! -//! The batch index is used as the batch nonce (id). The scheduler checks that nonces are -//! strictly increasing and invalidates otherwise, so duplicates are deduplicated at the -//! scheduler level. See `worker` for the wake → read S → compare → submit → sleep loop. - -mod batch_poster; -mod config; -mod worker; - -pub use batch_poster::{ - BatchPoster, BatchPosterConfig, BatchPosterError, EthereumBatchPoster, TxHash, -}; -pub use config::BatchSubmitterConfig; -pub use worker::{BatchSubmitter, BatchSubmitterError, TickOutcome}; diff --git a/sequencer/src/batch_submitter/worker.rs b/sequencer/src/batch_submitter/worker.rs deleted file mode 100644 index b5a79cc..0000000 --- a/sequencer/src/batch_submitter/worker.rs +++ /dev/null @@ -1,382 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Batch submitter worker: at-least-once submission to L1, deduplicated by the scheduler. -//! -//! The worker is intentionally stateless with respect to submitted-batch progress. -//! On each tick it derives the highest submitted batch nonce from L1, compares that -//! with locally closed batches, submits the first missing batch if any, then loops. - -use std::sync::Arc; -use std::time::Duration; - -use alloy_primitives::Address; -use thiserror::Error; -use tracing::{debug, info, warn}; - -use crate::batch_submitter::{BatchPoster, BatchPosterError, BatchSubmitterConfig, TxHash}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{Storage, StorageOpenError}; - -#[derive(Debug, Error)] -pub enum BatchSubmitterError { - #[error(transparent)] - OpenStorage(#[from] StorageOpenError), - #[error(transparent)] - Storage(#[from] rusqlite::Error), - #[error("batch submitter join error: {0}")] - Join(String), - #[error(transparent)] - Poster(#[from] BatchPosterError), -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TickOutcome { - Idle, - Submitted { batch_index: u64, tx_hash: TxHash }, -} - -pub struct BatchSubmitter { - db_path: String, - batch_submitter_address: Address, - poster: Arc

, - idle_poll_interval: Duration, - shutdown: ShutdownSignal, -} - -impl BatchSubmitter

{ - pub fn new( - db_path: impl Into, - batch_submitter_address: Address, - poster: Arc

, - shutdown: ShutdownSignal, - config: BatchSubmitterConfig, - ) -> Self { - Self { - db_path: db_path.into(), - batch_submitter_address, - poster, - idle_poll_interval: config.idle_poll_interval(), - shutdown, - } - } - - pub fn start( - self, - ) -> Result>, StorageOpenError> { - let _ = Storage::open_read_only(self.db_path.as_str())?; - Ok(tokio::spawn(async move { self.run_forever().await })) - } - - async fn run_forever(self) -> Result<(), BatchSubmitterError> { - loop { - if self.shutdown.is_shutdown_requested() { - return Ok(()); - } - - match self.tick_once().await { - Ok(TickOutcome::Submitted { .. }) => continue, - Ok(TickOutcome::Idle) => {} - Err(BatchSubmitterError::Poster(source)) => { - warn!(error = %source, "batch submitter tick failed, will retry"); - } - Err(err) => return Err(err), - } - - tokio::select! { - _ = self.shutdown.wait_for_shutdown() => return Ok(()), - _ = tokio::time::sleep(self.idle_poll_interval) => {} - } - } - } - - pub(crate) async fn tick_once(&self) -> Result { - let latest_batch_opt = self.load_latest_batch_index().await?; - let Some(latest_batch_index) = latest_batch_opt else { - return Ok(TickOutcome::Idle); - }; - - if latest_batch_index == 0 { - return Ok(TickOutcome::Idle); - } - - let last_closed = latest_batch_index - 1; - let next_expected = { - let (safe_block, safe_next_expected) = - self.load_safe_next_expected_batch_nonce().await?; - - let recent_observed_nonces = self - .poster - .observed_submitted_batch_nonces(safe_block.saturating_add(1)) - .await?; - advance_expected_batch_nonce(safe_next_expected, recent_observed_nonces) - }; - let latest_submitted = next_expected.checked_sub(1); - let first_to_submit = latest_submitted.map(|s| s + 1).unwrap_or(0); - if first_to_submit > last_closed { - return Ok(TickOutcome::Idle); - } - if first_to_submit < last_closed { - let pending_batches = last_closed - first_to_submit + 1; - warn!( - first_to_submit, - last_closed, pending_batches, "multiple closed batches are pending submission" - ); - } - - let batch = self.load_batch_for_submission(first_to_submit).await?; - debug!(batch_index = first_to_submit, "submitting batch to L1"); - let tx_hash = self - .poster - .submit_batch(batch.encode_for_scheduler()) - .await?; - info!(batch_index = first_to_submit, %tx_hash, "batch submitted to L1"); - - Ok(TickOutcome::Submitted { - batch_index: first_to_submit, - tx_hash, - }) - } - - async fn load_latest_batch_index(&self) -> Result, BatchSubmitterError> { - let db_path = self.db_path.clone(); - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .latest_batch_index() - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? - } - - const SAFE_NONCE_PAGE_SIZE: u64 = 256; - - async fn load_safe_next_expected_batch_nonce(&self) -> Result<(u64, u64), BatchSubmitterError> { - let db_path = self.db_path.clone(); - let batch_submitter_address = self.batch_submitter_address; - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .advance_safe_batch_nonce_for_sender( - batch_submitter_address, - Self::SAFE_NONCE_PAGE_SIZE, - ) - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? - } - - async fn load_batch_for_submission( - &self, - batch_index: u64, - ) -> Result { - let db_path = self.db_path.clone(); - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .load_batch_for_submission(batch_index) - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? - } -} - -fn advance_expected_batch_nonce( - mut expected: u64, - observed_nonces: impl IntoIterator, -) -> u64 { - for nonce in observed_nonces { - if nonce == expected { - expected = expected.saturating_add(1); - } - } - expected -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use alloy_primitives::Address; - - use crate::batch_submitter::{ - BatchSubmitterConfig, BatchSubmitterError, TickOutcome, batch_poster::mock::MockBatchPoster, - }; - use crate::shutdown::ShutdownSignal; - use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; - use tempfile::TempDir; - - const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; - const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); - - fn temp_db(name: &str) -> (TempDir, String) { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-batch-submitter-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - (dir, path.to_string_lossy().into_owned()) - } - - fn seed_two_closed_batches(db_path: &str) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let next_safe = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 0"); - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 1"); - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 2"); - } - - fn seed_safe_submitted_batches(db_path: &str, safe_block: u64, nonces: &[u64]) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); - let inputs: Vec<_> = nonces - .iter() - .map(|nonce| StoredSafeInput { - sender: BATCH_SUBMITTER_ADDRESS, - payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { - nonce: *nonce, - frames: Vec::new(), - }), - block_number: safe_block, - }) - .collect(); - storage - .append_safe_inputs(safe_block, inputs.as_slice()) - .expect("append safe submitted batches"); - } - - #[tokio::test] - async fn tick_once_submits_first_missing_closed_batch() { - let (_dir, path) = temp_db("tick-submits"); - seed_two_closed_batches(&path); - - let mock = Arc::new(MockBatchPoster::new()); - let config = BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }; - let submitter = super::BatchSubmitter::new( - path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - config, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!( - outcome, - TickOutcome::Submitted { - batch_index: 0, - tx_hash: alloy_primitives::B256::ZERO - } - ); - - let submissions = mock.submissions(); - assert_eq!(submissions.len(), 1); - assert_eq!(submissions[0].0, 0); - } - - #[tokio::test] - async fn tick_once_submits_nothing_when_already_caught_up() { - let (_dir, path) = temp_db("tick-caught-up"); - seed_two_closed_batches(&path); - seed_safe_submitted_batches(&path, 10, &[0, 1]); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_nonces(vec![2]); - let config = BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }; - let submitter = super::BatchSubmitter::new( - path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - config, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!(outcome, TickOutcome::Idle); - assert!(mock.submissions().is_empty()); - assert_eq!(mock.last_from_block(), Some(11)); - } - - #[tokio::test] - async fn tick_once_combines_safe_prefix_with_recent_chain_suffix() { - let (_dir, path) = temp_db("tick-combines-prefix-and-suffix"); - seed_two_closed_batches(&path); - seed_safe_submitted_batches(&path, 10, &[0]); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_nonces(vec![1]); - let submitter = super::BatchSubmitter::new( - path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!( - outcome, - TickOutcome::Submitted { - batch_index: 2, - tx_hash: alloy_primitives::B256::ZERO - } - ); - assert_eq!(mock.last_from_block(), Some(11)); - } - - #[tokio::test] - async fn tick_once_propagates_poster_errors() { - let (_dir, path) = temp_db("tick-poster-error"); - seed_two_closed_batches(&path); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_error(Some("rpc fail")); - let submitter = super::BatchSubmitter::new( - path, - BATCH_SUBMITTER_ADDRESS, - mock, - ShutdownSignal::default(), - BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }, - ); - - let err = submitter - .tick_once() - .await - .expect_err("poster error should propagate"); - assert!(matches!(err, BatchSubmitterError::Poster(_))); - } - - #[test] - fn advance_expected_batch_nonce_matches_scheduler_nonce_rule() { - assert_eq!(super::advance_expected_batch_nonce(0, Vec::::new()), 0); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 2]), 3); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 3]), 1); - assert_eq!(super::advance_expected_batch_nonce(0, vec![1, 2, 3]), 0); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 1, 2]), 3); - assert_eq!( - super::advance_expected_batch_nonce(0, vec![6, 4, 3, 2, 2, 0, 1]), - 2 - ); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 1]), 2); - assert_eq!(super::advance_expected_batch_nonce(2, vec![2, 3]), 4); - } -} diff --git a/sequencer/src/egress/api/health.rs b/sequencer/src/egress/api/health.rs new file mode 100644 index 0000000..ac783c7 --- /dev/null +++ b/sequencer/src/egress/api/health.rs @@ -0,0 +1,116 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Health probes (k8s-style): +//! +//! - `GET /livez` — process is up. Always 200. +//! - `GET /readyz` — ready to accept new transactions. 503 if shutdown is in +//! progress or the inclusion lane has dropped its receiver. +//! - `GET /healthz` — JSON status report. 200 / 503 mirroring `/readyz`. +//! +//! Lives on egress because operators (and kubelet, in practice) probe from the +//! internal-cluster side. + +use std::sync::Arc; + +use axum::Json; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::IntoResponse; +use serde::Serialize; +use tokio::sync::mpsc; + +use crate::ingress::inclusion_lane::PendingUserOp; +use crate::runtime::shutdown::ShutdownSignal; + +/// Narrow health-check state. Holds only the signals the probes inspect; the +/// `tx_sender` is a clone of the inclusion-lane channel and is closed iff the +/// lane has dropped its receiver. +#[derive(Clone)] +pub(crate) struct HealthState { + pub tx_sender: mpsc::Sender, + pub shutdown: ShutdownSignal, +} + +#[derive(Serialize)] +struct HealthStatus { + status: &'static str, + inclusion_lane: &'static str, +} + +pub(crate) async fn livez() -> StatusCode { + StatusCode::OK +} + +pub(crate) async fn readyz(State(state): State>) -> StatusCode { + if state.shutdown.is_shutdown_requested() || state.tx_sender.is_closed() { + StatusCode::SERVICE_UNAVAILABLE + } else { + StatusCode::OK + } +} + +pub(crate) async fn healthz(State(state): State>) -> impl IntoResponse { + let lane_ok = !state.tx_sender.is_closed(); + let shutting_down = state.shutdown.is_shutdown_requested(); + let all_ok = lane_ok && !shutting_down; + + let body = HealthStatus { + status: if all_ok { "ok" } else { "degraded" }, + inclusion_lane: if lane_ok { "ok" } else { "stopped" }, + }; + + let status = if all_ok { + StatusCode::OK + } else { + StatusCode::SERVICE_UNAVAILABLE + }; + (status, Json(body)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn fresh_state() -> (Arc, mpsc::Receiver) { + let (tx_sender, rx) = mpsc::channel::(1); + let state = Arc::new(HealthState { + tx_sender, + shutdown: ShutdownSignal::default(), + }); + (state, rx) + } + + #[tokio::test] + async fn livez_is_always_ok() { + assert_eq!(livez().await, StatusCode::OK); + } + + #[tokio::test] + async fn readyz_is_ok_when_lane_alive_and_not_shutting_down() { + let (state, _rx) = fresh_state(); + assert_eq!(readyz(State(state)).await, StatusCode::OK); + } + + #[tokio::test] + async fn readyz_is_unavailable_when_shutdown_requested() { + let (state, _rx) = fresh_state(); + state.shutdown.request_shutdown(); + assert_eq!(readyz(State(state)).await, StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn readyz_is_unavailable_when_lane_dropped() { + let (state, rx) = fresh_state(); + drop(rx); + assert_eq!(readyz(State(state)).await, StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn healthz_reports_lane_stopped_after_lane_drop() { + let (state, rx) = fresh_state(); + drop(rx); + let response = healthz(State(state)).await.into_response(); + assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE); + } +} diff --git a/sequencer/src/egress/api/mod.rs b/sequencer/src/egress/api/mod.rs new file mode 100644 index 0000000..5d112ea --- /dev/null +++ b/sequencer/src/egress/api/mod.rs @@ -0,0 +1,36 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Egress HTTP API routes: WebSocket subscribe + k8s-style health probes. +//! Additional read endpoints will land here. + +mod health; +mod state; +mod subscribe; + +use std::sync::Arc; + +use axum::Router; +use axum::routing::get; + +pub(crate) use health::HealthState; +pub(crate) use state::SubscribeState; + +/// Build the egress router. Each subrouter has its own state; the merge is +/// transparent to axum's routing. +pub(crate) fn router( + subscribe_state: Arc, + health_state: Arc, +) -> Router { + let subscribe_router = Router::new() + .route("/ws/subscribe", get(subscribe::subscribe_l2_txs)) + .with_state(subscribe_state); + + let health_router = Router::new() + .route("/livez", get(health::livez)) + .route("/readyz", get(health::readyz)) + .route("/healthz", get(health::healthz)) + .with_state(health_state); + + subscribe_router.merge(health_router) +} diff --git a/sequencer/src/api/state.rs b/sequencer/src/egress/api/state.rs similarity index 55% rename from sequencer/src/api/state.rs rename to sequencer/src/egress/api/state.rs index 752e254..de15396 100644 --- a/sequencer/src/api/state.rs +++ b/sequencer/src/egress/api/state.rs @@ -1,43 +1,36 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Egress-side axum state — feeds the WS subscribe handler today; will grow as +//! more egress routes are added. + use std::sync::Arc; -use alloy_sol_types::Eip712Domain; -use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; -use super::{ApiConfig, ApiError}; -use crate::inclusion_lane::PendingUserOp; -use crate::l2_tx_feed::L2TxFeed; -use crate::shutdown::ShutdownSignal; +use crate::egress::l2_tx_feed::L2TxFeed; +use crate::http::ApiError; +use crate::runtime::shutdown::ShutdownSignal; #[derive(Clone)] -pub(super) struct ApiState { - pub tx_sender: mpsc::Sender, - pub domain: Eip712Domain, - pub max_user_op_data_bytes: usize, +pub(crate) struct SubscribeState { pub shutdown: ShutdownSignal, pub ws_subscriber_limit: Arc, pub ws_max_catchup_events: u64, pub tx_feed: L2TxFeed, } -impl ApiState { - pub(super) fn new( - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, +impl SubscribeState { + pub(crate) fn new( shutdown: ShutdownSignal, tx_feed: L2TxFeed, - config: ApiConfig, + ws_max_subscribers: usize, + ws_max_catchup_events: u64, ) -> Self { Self { - tx_sender, - domain, - max_user_op_data_bytes, shutdown, - ws_subscriber_limit: Arc::new(Semaphore::new(config.ws_max_subscribers)), - ws_max_catchup_events: config.ws_max_catchup_events, + ws_subscriber_limit: Arc::new(Semaphore::new(ws_max_subscribers)), + ws_max_catchup_events, tx_feed, } } diff --git a/sequencer/src/api/ws.rs b/sequencer/src/egress/api/subscribe.rs similarity index 77% rename from sequencer/src/api/ws.rs rename to sequencer/src/egress/api/subscribe.rs index 23aacf8..897f73f 100644 --- a/sequencer/src/api/ws.rs +++ b/sequencer/src/egress/api/subscribe.rs @@ -1,6 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! `GET /ws/subscribe` — replay-then-live stream of ordered L2 txs. +//! Acquires a subscriber permit before upgrading; permit is held for the +//! lifetime of the session and released on disconnect via `Drop`. + use std::sync::Arc; use axum::extract::ws::{CloseFrame, Message, WebSocket, WebSocketUpgrade, close_code}; @@ -10,20 +14,21 @@ use serde::Deserialize; use tokio::sync::OwnedSemaphorePermit; use tracing::warn; -use crate::l2_tx_feed::{BroadcastTxMessage, L2TxFeed, SubscribeError}; +use crate::egress::l2_tx_feed::{BroadcastTxMessage, L2TxFeed, SubscribeError}; +use crate::http::WS_CATCHUP_WINDOW_EXCEEDED_REASON; -use super::{ApiState, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +use super::SubscribeState; const MAX_INBOUND_WS_MESSAGE_SIZE: usize = 8 * 1024; const MAX_INBOUND_WS_FRAME_SIZE: usize = 8 * 1024; #[derive(Debug, Deserialize)] -pub(super) struct SubscribeQuery { +pub(crate) struct SubscribeQuery { from_offset: Option, } -pub(super) async fn subscribe_l2_txs( - State(state): State>, +pub(crate) async fn subscribe_l2_txs( + State(state): State>, Query(query): Query, ws: WebSocketUpgrade, ) -> Response { @@ -67,32 +72,22 @@ async fn run_ws_session( max_catchup_events, "ws catch-up window exceeded; closing subscriber" ); - let _ = socket - .send(Message::Close(Some(CloseFrame { - code: close_code::POLICY, - reason: WS_CATCHUP_WINDOW_EXCEEDED_REASON.into(), - }))) - .await; + close_with_frame( + &mut socket, + close_code::POLICY, + WS_CATCHUP_WINDOW_EXCEEDED_REASON, + ) + .await; return; } Err(SubscribeError::OpenStorage { source }) => { warn!(error = %source, "ws subscription failed to open replay storage"); - let _ = socket - .send(Message::Close(Some(CloseFrame { - code: close_code::ERROR, - reason: "subscription unavailable".into(), - }))) - .await; + close_with_frame(&mut socket, close_code::ERROR, "subscription unavailable").await; return; } Err(SubscribeError::LoadHeadOffset { source }) => { warn!(error = %source, "ws subscription failed to read replay head"); - let _ = socket - .send(Message::Close(Some(CloseFrame { - code: close_code::ERROR, - reason: "subscription unavailable".into(), - }))) - .await; + close_with_frame(&mut socket, close_code::ERROR, "subscription unavailable").await; return; } }; @@ -127,6 +122,15 @@ async fn run_ws_session( } } +async fn close_with_frame(socket: &mut WebSocket, code: u16, reason: &str) { + let _ = socket + .send(Message::Close(Some(CloseFrame { + code, + reason: reason.into(), + }))) + .await; +} + async fn send_ws_event(socket: &mut WebSocket, event: &BroadcastTxMessage) -> Result<(), ()> { let payload = match serde_json::to_string(event) { Ok(value) => value, diff --git a/sequencer/src/l2_tx_feed/error.rs b/sequencer/src/egress/l2_tx_feed/error.rs similarity index 100% rename from sequencer/src/l2_tx_feed/error.rs rename to sequencer/src/egress/l2_tx_feed/error.rs diff --git a/sequencer/src/l2_tx_feed/feed.rs b/sequencer/src/egress/l2_tx_feed/mod.rs similarity index 78% rename from sequencer/src/l2_tx_feed/feed.rs rename to sequencer/src/egress/l2_tx_feed/mod.rs index 15c5e49..c7cf616 100644 --- a/sequencer/src/l2_tx_feed/feed.rs +++ b/sequencer/src/egress/l2_tx_feed/mod.rs @@ -1,15 +1,23 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! DB-backed ordered-L2-tx feed used by WS subscriptions and catch-up replay. + +mod error; + +#[cfg(test)] +mod tests; + +pub use error::{SubscribeError, SubscriptionError}; +pub use sequencer_core::broadcast::BroadcastTxMessage; + use std::time::Duration; use alloy_primitives::Address; -pub use sequencer_core::broadcast::BroadcastTxMessage; use sequencer_core::l2_tx::SequencedL2Tx; use tokio::sync::mpsc; -use super::{SubscribeError, SubscriptionError}; -use crate::shutdown::ShutdownSignal; +use crate::runtime::shutdown::ShutdownSignal; use crate::storage::Storage; #[derive(Debug, Clone, Copy)] @@ -66,8 +74,12 @@ impl L2TxFeed { from_offset: u64, max_catchup_events: u64, ) -> Result { - let head_offset = load_head_offset(self.db_path.as_str())?; - let catchup_events = head_offset.saturating_sub(from_offset); + let (head_offset, catchup_events) = load_catchup_info( + self.db_path.as_str(), + from_offset, + max_catchup_events, + self.batch_submitter_address, + )?; if catchup_events > max_catchup_events { return Err(SubscribeError::CatchUpWindowExceeded { requested_offset: from_offset, @@ -126,12 +138,29 @@ impl Subscription { } } -fn load_head_offset(db_path: &str) -> Result { +/// Returns `(head_offset, broadcastable_event_count_after_from_offset)`. +/// +/// Counts events the client will actually receive — excludes invalidated batches +/// and batch-submitter direct inputs (which are filtered before WS delivery). +fn load_catchup_info( + db_path: &str, + from_offset: u64, + max_catchup_events: u64, + batch_submitter_address: Option

, +) -> Result<(u64, u64), SubscribeError> { let mut storage = Storage::open_read_only(db_path) .map_err(|source| SubscribeError::OpenStorage { source })?; - storage - .ordered_l2_tx_count() - .map_err(|source| SubscribeError::LoadHeadOffset { source }) + let head_offset = storage + .ordered_l2_tx_head_offset() + .map_err(|source| SubscribeError::LoadHeadOffset { source })?; + let catchup_count = storage + .count_broadcastable_events_after( + from_offset, + max_catchup_events.saturating_add(1), + batch_submitter_address, + ) + .map_err(|source| SubscribeError::LoadHeadOffset { source })?; + Ok((head_offset, catchup_count)) } fn run_subscription( @@ -153,7 +182,7 @@ fn run_subscription( } let txs = storage - .load_ordered_l2_txs_page_from(next_offset, page_size) + .ordered_l2_txs_page_from(next_offset, page_size) .map_err(|source| SubscriptionError::LoadReplay { offset: next_offset, source, @@ -164,18 +193,18 @@ fn run_subscription( continue; } - for tx in txs { + for (db_offset, tx) in txs { if shutdown.is_shutdown_requested() || events_tx.is_closed() { return Ok(()); } + next_offset = db_offset; + if should_filter_from_broadcast(&tx, batch_submitter_address) { - next_offset = next_offset.saturating_add(1); continue; } - let event = BroadcastTxMessage::from_offset_and_tx(next_offset, tx); - next_offset = next_offset.saturating_add(1); + let event = BroadcastTxMessage::from_offset_and_tx(db_offset, tx); if events_tx.blocking_send(event).is_err() { return Ok(()); } diff --git a/sequencer/src/egress/l2_tx_feed/tests.rs b/sequencer/src/egress/l2_tx_feed/tests.rs new file mode 100644 index 0000000..3f996a4 --- /dev/null +++ b/sequencer/src/egress/l2_tx_feed/tests.rs @@ -0,0 +1,379 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use std::time::{Duration, SystemTime}; + +use alloy_primitives::{Address, Signature}; +use tokio::sync::oneshot; + +use super::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, SubscribeError}; +use crate::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::test_helpers::temp_db; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; +use sequencer_core::user_op::UserOp; + +#[test] +fn broadcast_user_op_serializes_with_hex_data() { + let msg = BroadcastTxMessage::from_offset_and_tx( + 7, + SequencedL2Tx::UserOp(ValidUserOp { + sender: Address::from_slice(&[0x11; 20]), + fee: 3, + data: vec![0xaa, 0xbb], + }), + ); + let json = serde_json::to_string(&msg).expect("serialize"); + assert!(json.contains("\"kind\":\"user_op\"")); + assert!(json.contains("\"offset\":7")); + assert!(json.contains("\"fee\":3")); + assert!(json.contains("\"data\":\"0xaabb\"")); +} + +#[test] +fn broadcast_direct_input_serializes_with_hex_payload() { + let msg = BroadcastTxMessage::from_offset_and_tx( + 9, + SequencedL2Tx::Direct(DirectInput { + sender: Address::ZERO, + block_number: 42, + payload: vec![0xcc, 0xdd], + }), + ); + let json = serde_json::to_string(&msg).expect("serialize"); + assert!(json.contains("\"kind\":\"direct_input\"")); + assert!(json.contains("\"offset\":9")); + assert!(json.contains("\"sender\":\"0x0000000000000000000000000000000000000000\"")); + assert!(json.contains("\"block_number\":42")); + assert!(json.contains("\"payload\":\"0xccdd\"")); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscribe_from_rejects_catchup_window() { + let db = temp_db("catchup-window"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let result = feed.subscribe_from(0, 1); + + assert!(matches!( + result, + Err(SubscribeError::CatchUpWindowExceeded { + requested_offset: 0, + live_start_offset: 2, + max_catchup_events: 1, + }) + )); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscribe_from_accepts_exact_catchup_window() { + let db = temp_db("catchup-window-exact"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let subscription = feed.subscribe_from(0, 2); + + assert!( + subscription.is_ok(), + "exactly 2 replayable events should be allowed" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscription_replays_existing_rows_in_order() { + let db = temp_db("replay-existing"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); + + let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait first event") + .expect("first event"); + let second = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait second event") + .expect("second event"); + + // DB offsets (SQLite rowid) start at 1. + assert_eq!(first.offset(), 1); + assert_eq!(second.offset(), 2); + + subscription.finish().await.expect("finish subscription"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscription_filters_batch_submitter_safe_inputs() { + let db = temp_db("filters-batch-submitter-inputs"); + let batch_submitter_address = Address::from([0xfe; 20]); + seed_ordered_txs_with_sender(db.path.as_str(), batch_submitter_address); + let feed = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + idle_poll_interval: Duration::from_millis(2), + page_size: 64, + batch_submitter_address: Some(batch_submitter_address), + }, + ); + + let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); + let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait first event") + .expect("first event"); + + // DB offsets start at 1. The user op is the first sequenced tx (offset=1), + // and the batch submitter's safe input (offset=2) is filtered out. + assert!(matches!( + first, + BroadcastTxMessage::UserOp { offset: 1, .. } + )); + + let no_second = tokio::time::timeout(Duration::from_millis(50), subscription.recv()).await; + assert!( + no_second.is_err(), + "filtered batch-submitter input should not be broadcast" + ); + + subscription.finish().await.expect("finish subscription"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shutdown_signal_closes_subscription() { + let db = temp_db("shutdown-closes"); + seed_ordered_txs(db.path.as_str()); + let shutdown = ShutdownSignal::default(); + let feed = test_feed(db.path.as_str(), shutdown.clone()); + + let mut subscription = feed.subscribe_from(u64::MAX, u64::MAX).expect("subscribe"); + + shutdown.request_shutdown(); + + assert!( + tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait for subscription close") + .is_none() + ); + subscription.finish().await.expect("clean shutdown"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn catchup_window_not_inflated_by_invalidated_batch_holes() { + // Regression test: after batch invalidation, offset holes in sequenced_l2_txs + // must not inflate the catch-up event count. The check should count actual + // valid events, not subtract rowids. + let db = temp_db("catchup-holes"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Create two closed batches, each with one direct input. + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .append_safe_inputs( + 10, + &[StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, + ) + .expect("append direct 0"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 20, + }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, + ) + .expect("append direct 1"); + storage + .close_frame_only(&mut head, 20, SafeInputRange::new(1, 2)) + .expect("close frame"); + drop(storage); + + // Before invalidation: 2 valid events. + // With max_catchup_events=1, subscribing from 0 should fail. + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + assert!( + feed.subscribe_from(0, 1).is_err(), + "should reject: 2 valid events > max 1" + ); + + // Invalidate batch 0 — this creates a hole in the offset space. + // Now only 1 valid event remains (from batch 1). + let mut storage = Storage::open(db.path.as_str()).expect("reopen storage"); + storage.insert_invalid_batch(0).expect("invalidate batch 0"); + drop(storage); + + // After invalidation: only 1 valid event, so max_catchup_events=1 should succeed. + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + assert!( + feed.subscribe_from(0, 1).is_ok(), + "should accept: only 1 valid event after invalidation, despite rowid hole" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn catchup_window_excludes_batch_submitter_direct_inputs() { + // Regression test: batch-submitter direct inputs are filtered before WS + // delivery, so the catch-up window must not count them. Otherwise a + // reconnecting client could be rejected even when the number of + // replayable messages is within the limit. + let db = temp_db("catchup-submitter-filter"); + let batch_submitter = Address::from([0xfe; 20]); + let user_address = Address::from([0x01; 20]); + + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + + // Two direct inputs: one from the batch submitter, one from a user. + storage + .append_safe_inputs( + 10, + &[ + StoredSafeInput { + sender: batch_submitter, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: user_address, + payload: vec![0xbb], + block_number: 10, + }, + ], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, + ) + .expect("append directs"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame"); + drop(storage); + + // Without batch_submitter_address filtering: 2 events, max=1 should reject. + let feed_no_filter = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + batch_submitter_address: None, + ..L2TxFeedConfig::default() + }, + ); + assert!( + feed_no_filter.subscribe_from(0, 1).is_err(), + "without filter: 2 events > max 1" + ); + + // With batch_submitter_address filtering: only the user's event counts. + let feed_filtered = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + batch_submitter_address: Some(batch_submitter), + ..L2TxFeedConfig::default() + }, + ); + assert!( + feed_filtered.subscribe_from(0, 1).is_ok(), + "with filter: only 1 broadcastable event, should accept" + ); +} + +fn test_feed(db_path: &str, shutdown: ShutdownSignal) -> L2TxFeed { + L2TxFeed::new( + db_path.to_string(), + shutdown, + L2TxFeedConfig { + idle_poll_interval: Duration::from_millis(2), + page_size: 64, + batch_submitter_address: None, + }, + ) +} + +fn seed_ordered_txs(db_path: &str) { + seed_ordered_txs_with_sender(db_path, Address::ZERO); +} + +fn seed_ordered_txs_with_sender(db_path: &str, direct_sender: Address) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + let (respond_to, _recv) = oneshot::channel::>(); + let pending = PendingUserOp { + signed: sequencer_core::user_op::SignedUserOp { + sender: Address::from_slice(&[0x11; 20]), + signature: Signature::test_signature(), + user_op: UserOp { + nonce: 0, + max_fee: 3, + data: vec![0x42].into(), + }, + }, + respond_to, + received_at: SystemTime::now(), + }; + + storage + .append_user_ops_chunk(&mut head, &[pending]) + .expect("append user-op chunk"); + storage + .append_safe_inputs( + 10, + &[StoredSafeInput { + sender: direct_sender, + payload: vec![0xaa], + block_number: 10, + }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, + ) + .expect("append direct input"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame with one drained direct input"); +} diff --git a/sequencer/src/egress/mod.rs b/sequencer/src/egress/mod.rs new file mode 100644 index 0000000..ac7b75a --- /dev/null +++ b/sequencer/src/egress/mod.rs @@ -0,0 +1,9 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Outbound side: WS subscribe (today), future read-only endpoints, and the +//! L2-tx feed that backs them. Operated for internal indexers; the future api +//! split puts these on a separate port from ingress. + +pub mod api; +pub mod l2_tx_feed; diff --git a/sequencer/src/http.rs b/sequencer/src/http.rs new file mode 100644 index 0000000..3ee56f1 --- /dev/null +++ b/sequencer/src/http.rs @@ -0,0 +1,231 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared HTTP surface: error type + JSON response shape used by both +//! ingress (`/tx`) and egress (`/ws/subscribe`, future routes), plus the +//! `axum::serve` orchestration that wires the two side routers together. +//! +//! Today both sides serve from one listener; the planned api split puts each +//! side on its own port (same binary, two listeners). When that lands, the +//! orchestration here becomes per-side `start_*` calls. + +use std::io; +use std::sync::Arc; + +use alloy_sol_types::Eip712Domain; +use axum::Json; +use axum::Router; +use axum::extract::DefaultBodyLimit; +use axum::http::StatusCode; +use axum::response::{IntoResponse, Response}; +use serde::Serialize; +use thiserror::Error; +use tokio::sync::mpsc; +use tower_http::trace::TraceLayer; + +use crate::egress::api::SubscribeState; +use crate::egress::l2_tx_feed::L2TxFeed; +use crate::ingress::api::SubmitState; +use crate::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use crate::runtime::shutdown::ShutdownSignal; +use sequencer_core::api::{TxRequest, TxRequestError}; + +#[derive(Debug, Error, Clone)] +pub enum ApiError { + #[error("{0}")] + BadRequest(String), + #[error("{0}")] + PayloadTooLarge(String), + #[error("{0}")] + InvalidSignature(String), + #[error("{0}")] + ExecutionRejected(String), + #[error("{0}")] + Unavailable(String), + #[error("{0}")] + InternalError(String), + #[error("{0}")] + Overloaded(String), +} + +#[derive(Debug, Serialize)] +struct ErrorResponse { + ok: bool, + code: &'static str, + message: String, +} + +impl ApiError { + pub fn bad_request(message: impl Into) -> Self { + Self::BadRequest(message.into()) + } + + pub fn payload_too_large(message: impl Into) -> Self { + Self::PayloadTooLarge(message.into()) + } + + pub fn invalid_signature(message: impl Into) -> Self { + Self::InvalidSignature(message.into()) + } + + pub fn internal_error(message: impl Into) -> Self { + Self::InternalError(message.into()) + } + + pub fn unavailable(message: impl Into) -> Self { + Self::Unavailable(message.into()) + } + + pub fn overloaded(message: impl Into) -> Self { + Self::Overloaded(message.into()) + } + + pub fn status(&self) -> StatusCode { + match self { + Self::BadRequest(_) | Self::InvalidSignature(_) => StatusCode::BAD_REQUEST, + Self::PayloadTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, + Self::ExecutionRejected(_) => StatusCode::UNPROCESSABLE_ENTITY, + Self::Unavailable(_) => StatusCode::SERVICE_UNAVAILABLE, + Self::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR, + Self::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS, + } + } + + pub fn code(&self) -> &'static str { + match self { + Self::BadRequest(_) => "BAD_REQUEST", + Self::PayloadTooLarge(_) => "PAYLOAD_TOO_LARGE", + Self::InvalidSignature(_) => "INVALID_SIGNATURE", + Self::ExecutionRejected(_) => "EXECUTION_REJECTED", + Self::Unavailable(_) => "UNAVAILABLE", + Self::InternalError(_) => "INTERNAL_ERROR", + Self::Overloaded(_) => "OVERLOADED", + } + } +} + +impl From for ApiError { + fn from(value: SequencerError) -> Self { + match value { + SequencerError::Invalid(message) => Self::ExecutionRejected(message), + SequencerError::Unavailable(message) => Self::Unavailable(message), + SequencerError::Internal(message) => Self::InternalError(message), + } + } +} + +impl From for ApiError { + fn from(value: TxRequestError) -> Self { + match value { + TxRequestError::BadRequest(message) => Self::BadRequest(message), + TxRequestError::InvalidSignature(message) => Self::InvalidSignature(message), + } + } +} + +impl IntoResponse for ApiError { + fn into_response(self) -> Response { + let body = ErrorResponse { + ok: false, + code: self.code(), + message: self.to_string(), + }; + (self.status(), Json(body)).into_response() + } +} + +// ── HTTP server orchestration ──────────────────────────────────────────────── +// +// Combines ingress + egress routers into one axum::serve. The api split will +// replace this with per-side starts on different ports. + +const DEFAULT_WS_MAX_SUBSCRIBERS: usize = 64; +const DEFAULT_WS_MAX_CATCHUP_EVENTS: u64 = 50_000; +const DEFAULT_MAX_BODY_BYTES: usize = TxRequest::MAX_JSON_BYTES_RECOMMENDED; + +/// Reason returned in the WS Close frame when the subscriber's requested +/// `from_offset` is too old for the catch-up window to bridge. +pub const WS_CATCHUP_WINDOW_EXCEEDED_REASON: &str = "catch-up window exceeded"; + +pub type ApiServerTask = tokio::task::JoinHandle>; + +#[derive(Debug, Clone, Copy)] +pub struct ApiConfig { + pub max_body_bytes: usize, + pub ws_max_subscribers: usize, + pub ws_max_catchup_events: u64, +} + +impl Default for ApiConfig { + fn default() -> Self { + Self { + max_body_bytes: DEFAULT_MAX_BODY_BYTES, + ws_max_subscribers: DEFAULT_WS_MAX_SUBSCRIBERS, + ws_max_catchup_events: DEFAULT_WS_MAX_CATCHUP_EVENTS, + } + } +} + +#[allow(clippy::too_many_arguments)] +pub async fn start( + http_addr: impl tokio::net::ToSocketAddrs, + tx_sender: mpsc::Sender, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + tx_feed: L2TxFeed, + config: ApiConfig, +) -> io::Result { + let listener = tokio::net::TcpListener::bind(http_addr).await?; + Ok(start_on_listener( + listener, + tx_sender, + domain, + max_user_op_data_bytes, + shutdown, + tx_feed, + config, + )) +} + +#[allow(clippy::too_many_arguments)] +pub fn start_on_listener( + listener: tokio::net::TcpListener, + tx_sender: mpsc::Sender, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + tx_feed: L2TxFeed, + config: ApiConfig, +) -> ApiServerTask { + let health_state = Arc::new(crate::egress::api::HealthState { + tx_sender: tx_sender.clone(), + shutdown: shutdown.clone(), + }); + let submit_state = Arc::new(SubmitState::new( + tx_sender, + domain, + max_user_op_data_bytes, + shutdown.clone(), + )); + let subscribe_state = Arc::new(SubscribeState::new( + shutdown.clone(), + tx_feed, + config.ws_max_subscribers, + config.ws_max_catchup_events, + )); + + let app: Router = crate::ingress::api::router(submit_state) + .merge(crate::egress::api::router(subscribe_state, health_state)) + // Enforces a raw request-body cap before JSON deserialization, including whitespace. + .layer(DefaultBodyLimit::max(config.max_body_bytes)) + .layer(TraceLayer::new_for_http()); + + tokio::spawn(async move { + axum::serve(listener, app) + .with_graceful_shutdown(async move { + shutdown.wait_for_shutdown().await; + }) + .await + }) +} diff --git a/sequencer/src/inclusion_lane/config.rs b/sequencer/src/inclusion_lane/config.rs deleted file mode 100644 index fff90d8..0000000 --- a/sequencer/src/inclusion_lane/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::time::Duration; - -use alloy_primitives::Address; - -const DEFAULT_MAX_USER_OPS_PER_CHUNK: usize = 64; -const DEFAULT_SAFE_INPUT_BUFFER_CAPACITY: usize = 2048; -const DEFAULT_MAX_BATCH_OPEN: Duration = Duration::from_secs(2 * 60 * 60); -const DEFAULT_IDLE_POLL_INTERVAL: Duration = Duration::from_millis(10); - -#[derive(Debug, Clone, Copy)] -pub struct InclusionLaneConfig { - pub batch_submitter_address: Address, - pub max_user_ops_per_chunk: usize, - pub safe_input_buffer_capacity: usize, - pub max_batch_open: Duration, - pub idle_poll_interval: Duration, -} - -impl InclusionLaneConfig { - pub fn new(batch_submitter_address: Address) -> Self { - Self { - batch_submitter_address, - max_user_ops_per_chunk: DEFAULT_MAX_USER_OPS_PER_CHUNK, - safe_input_buffer_capacity: DEFAULT_SAFE_INPUT_BUFFER_CAPACITY, - max_batch_open: DEFAULT_MAX_BATCH_OPEN, - idle_poll_interval: DEFAULT_IDLE_POLL_INTERVAL, - } - } -} diff --git a/sequencer/src/inclusion_lane/lane.rs b/sequencer/src/inclusion_lane/lane.rs deleted file mode 100644 index 459833e..0000000 --- a/sequencer/src/inclusion_lane/lane.rs +++ /dev/null @@ -1,373 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::thread; -use std::time::SystemTime; - -use tokio::sync::mpsc; -use tokio::task::JoinHandle; - -use crate::shutdown::ShutdownSignal; -use crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; -use sequencer_core::application::{AppError, Application, ExecutionOutcome}; -use sequencer_core::l2_tx::DirectInput; -use sequencer_core::user_op::SignedUserOp; - -use super::catch_up::catch_up_application; -use super::config::InclusionLaneConfig; -use super::{InclusionLaneError, PendingUserOp, SequencerError}; - -pub struct InclusionLane { - rx: mpsc::Receiver, - shutdown: ShutdownSignal, - app: A, - storage: Storage, - config: InclusionLaneConfig, -} - -impl InclusionLane { - pub fn start( - queue_capacity: usize, - shutdown: ShutdownSignal, - app: A, - storage: Storage, - config: InclusionLaneConfig, - ) -> ( - mpsc::Sender, - JoinHandle>, - ) { - let (tx, rx) = mpsc::channel::(queue_capacity.max(1)); - let handle = tokio::task::spawn_blocking(move || { - let mut lane = Self { - rx, - shutdown, - app, - storage, - config, - }; - lane.run_forever() - }); - (tx, handle) - } - - fn run_forever(&mut self) -> Result<(), InclusionLaneError> { - self.run_catch_up()?; - let mut included = Vec::with_capacity(self.config.max_user_ops_per_chunk.max(1)); - let mut safe_inputs = Vec::with_capacity(self.config.safe_input_buffer_capacity.max(1)); - let mut lane_state = self.load_or_initialize_lane_state(&mut safe_inputs)?; - - loop { - if self.shutdown.is_shutdown_requested() { - self.reject_pending_user_ops_due_to_shutdown(); - return Ok(()); - } - - let advanced_safe_frontier = - self.maybe_advance_safe_frontier(&mut lane_state, &mut safe_inputs)?; - - let included_user_op_count = - self.process_user_op_chunk(&mut lane_state.head, &mut included)?; - - if should_close_batch::(&lane_state.head, &self.config) { - let next_safe_block = lane_state.head.safe_block; - self.close_frame_and_batch(&mut lane_state.head, next_safe_block)?; - } else if !advanced_safe_frontier && included_user_op_count == 0 { - thread::sleep(self.config.idle_poll_interval); - } - } - } - - fn run_catch_up(&mut self) -> Result<(), InclusionLaneError> { - catch_up_application( - &mut self.app, - &mut self.storage, - self.config.batch_submitter_address, - ) - .map_err(|source| InclusionLaneError::CatchUp { source }) - } - - fn load_or_initialize_lane_state( - &mut self, - safe_inputs: &mut Vec, - ) -> Result { - let next_safe_input_index = self - .storage - .load_next_undrained_safe_input_index() - .map_err(|source| InclusionLaneError::LoadNextUndrainedDirectInputIndex { source })?; - - let last_drained_direct_range = SafeInputRange::empty_at(next_safe_input_index); - if let Some(head) = self - .storage - .load_open_state() - .map_err(|source| InclusionLaneError::LoadOpenState { source })? - { - return Ok(LaneState { - last_drained_direct_range, - head, - }); - } - - let frontier = self - .storage - .load_safe_frontier() - .map_err(|source| InclusionLaneError::LoadSafeInputs { source })?; - assert!( - frontier.end_exclusive >= last_drained_direct_range.end_exclusive, - "safe-input head regressed during lane initialization: safe_end={}, next={}", - frontier.end_exclusive, - last_drained_direct_range.end_exclusive - ); - - let leading_direct_range = last_drained_direct_range.advance_to(frontier.end_exclusive); - self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; - let head = self - .storage - .initialize_open_state(frontier.safe_block, leading_direct_range) - .map_err(|source| InclusionLaneError::LoadOpenState { source })?; - - Ok(LaneState { - last_drained_direct_range: leading_direct_range, - head, - }) - } - - fn process_user_op_chunk( - &mut self, - head: &mut WriteHead, - included: &mut Vec, - ) -> Result { - included.clear(); - dequeue_and_execute_user_op_chunk( - &mut self.rx, - &mut self.app, - head.frame_fee, - self.config.max_user_ops_per_chunk.max(1), - included, - )?; - let included_count = included.len(); - - self.persist_included_user_ops(head, included)?; - - for item in included.drain(..) { - let _ = item.respond_to.send(Ok(())); - } - - Ok(included_count) - } - - fn maybe_advance_safe_frontier( - &mut self, - lane_state: &mut LaneState, - safe_inputs: &mut Vec, - ) -> Result { - let frontier = self - .storage - .load_safe_frontier() - .map_err(|source| InclusionLaneError::LoadSafeInputs { source })?; - assert!( - frontier.end_exclusive >= lane_state.last_drained_direct_range.end_exclusive, - "safe-input head regressed: safe_end={}, next={}", - frontier.end_exclusive, - lane_state.last_drained_direct_range.end_exclusive - ); - if frontier.safe_block <= lane_state.head.safe_block { - return Ok(false); - } - - let leading_direct_range = lane_state - .last_drained_direct_range - .advance_to(frontier.end_exclusive); - self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; - self.close_frame_only( - &mut lane_state.head, - frontier.safe_block, - leading_direct_range, - )?; - lane_state.last_drained_direct_range = leading_direct_range; - Ok(true) - } - - fn persist_included_user_ops( - &mut self, - head: &mut WriteHead, - included: &mut Vec, - ) -> Result<(), InclusionLaneError> { - self.storage - .append_user_ops_chunk(head, included.as_slice()) - .map_err(|source| { - Self::respond_internal_to_all(included, format!("db error: {source}")); - InclusionLaneError::AppendUserOps { source } - }) - } - - fn execute_safe_inputs_range( - &mut self, - direct_range: SafeInputRange, - chunk: &mut Vec, - ) -> Result { - let max_chunk_len = self.config.safe_input_buffer_capacity.max(1) as u64; - let mut chunk_start = direct_range.start_inclusive; - while chunk_start < direct_range.end_exclusive { - let chunk_end_exclusive = direct_range - .end_exclusive - .min(chunk_start.saturating_add(max_chunk_len)); - self.load_safe_inputs_chunk(chunk_start, chunk_end_exclusive, chunk)?; - self.execute_safe_inputs_chunk(chunk.as_slice())?; - chunk_start = chunk_end_exclusive; - } - - Ok(direct_range) - } - - fn close_frame_and_batch( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - ) -> Result<(), InclusionLaneError> { - self.storage - .close_frame_and_batch(head, next_safe_block) - .map_err(|source| InclusionLaneError::CloseFrameRotate { source }) - } - - fn close_frame_only( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result<(), InclusionLaneError> { - self.storage - .close_frame_only(head, next_safe_block, leading_direct_range) - .map_err(|source| InclusionLaneError::CloseFrameRotate { source }) - } - - fn load_safe_inputs_chunk( - &mut self, - start_inclusive: u64, - end_exclusive: u64, - chunk: &mut Vec, - ) -> Result<(), InclusionLaneError> { - chunk.clear(); - self.storage - .fill_safe_inputs(start_inclusive, end_exclusive, chunk) - .map_err(|source| InclusionLaneError::LoadSafeInputs { source }) - } - - fn execute_safe_inputs_chunk( - &mut self, - chunk: &[StoredSafeInput], - ) -> Result<(), InclusionLaneError> { - for input in chunk { - if input.sender == self.config.batch_submitter_address { - continue; - } - let direct_input = DirectInput { - sender: input.sender, - block_number: input.block_number, - payload: input.payload.clone(), - }; - - self.app - .execute_direct_input(&direct_input) - .map_err(|source| InclusionLaneError::ExecuteDirectInput { source })?; - } - Ok(()) - } - - fn respond_internal_to_all(pending: &mut Vec, message: String) { - for item in pending.drain(..) { - let _ = item - .respond_to - .send(Err(SequencerError::internal(message.clone()))); - } - } - - fn reject_pending_user_ops_due_to_shutdown(&mut self) { - loop { - match self.rx.try_recv() { - Ok(item) => { - let _ = item - .respond_to - .send(Err(SequencerError::unavailable("sequencer shutting down"))); - } - Err(mpsc::error::TryRecvError::Empty) - | Err(mpsc::error::TryRecvError::Disconnected) => return, - } - } - } -} - -fn should_close_batch(head: &WriteHead, config: &InclusionLaneConfig) -> bool { - should_close_batch_by_time(head, config) || should_close_batch_by_size::(head) -} - -fn should_close_batch_by_time(head: &WriteHead, config: &InclusionLaneConfig) -> bool { - let age = SystemTime::now() - .duration_since(head.batch_created_at) - .unwrap_or_default(); - age >= config.max_batch_open -} - -fn should_close_batch_by_size(head: &WriteHead) -> bool { - user_op_count_to_bytes::(head.batch_user_op_count) >= head.max_batch_user_op_bytes -} - -fn execute_user_op( - app: &mut impl Application, - item: PendingUserOp, - current_frame_fee: u16, - included: &mut Vec, -) { - match app.validate_and_execute_user_op( - item.signed.sender, - &item.signed.user_op, - current_frame_fee, - ) { - Ok(ExecutionOutcome::Included { .. }) => included.push(item), - Ok(ExecutionOutcome::Invalid(reason)) => { - let _ = item - .respond_to - .send(Err(SequencerError::invalid(reason.to_string()))); - } - Err(AppError::Internal { reason }) => { - let _ = item.respond_to.send(Err(SequencerError::internal(reason))); - } - } -} - -pub(super) fn dequeue_and_execute_user_op_chunk( - rx: &mut mpsc::Receiver, - app: &mut impl Application, - current_frame_fee: u16, - max_chunk: usize, - included: &mut Vec, -) -> Result<(), InclusionLaneError> { - let mut executed_user_ops = 0_usize; - - while executed_user_ops < max_chunk { - match rx.try_recv() { - Ok(item) => { - execute_user_op(app, item, current_frame_fee, included); - executed_user_ops = executed_user_ops.saturating_add(1); - } - Err(mpsc::error::TryRecvError::Empty) => return Ok(()), - Err(mpsc::error::TryRecvError::Disconnected) => { - if executed_user_ops == 0 { - return Err(InclusionLaneError::ChannelClosed); - } - return Ok(()); - } - } - } - - Ok(()) -} - -fn user_op_count_to_bytes(user_op_count: u64) -> u64 { - let one_user_op_bytes = SignedUserOp::max_batch_metadata() + A::MAX_METHOD_PAYLOAD_BYTES; - user_op_count.saturating_mul(one_user_op_bytes as u64) -} - -struct LaneState { - last_drained_direct_range: SafeInputRange, - head: WriteHead, -} diff --git a/sequencer/src/inclusion_lane/mod.rs b/sequencer/src/inclusion_lane/mod.rs deleted file mode 100644 index 7e52786..0000000 --- a/sequencer/src/inclusion_lane/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod catch_up; -mod config; -mod error; -mod lane; -mod types; - -pub use config::InclusionLaneConfig; -pub use error::InclusionLaneError; -pub use lane::InclusionLane; -pub use types::{PendingUserOp, SequencerError}; - -#[cfg(test)] -mod tests; diff --git a/sequencer/src/ingress/api.rs b/sequencer/src/ingress/api.rs new file mode 100644 index 0000000..9ed754b --- /dev/null +++ b/sequencer/src/ingress/api.rs @@ -0,0 +1,285 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! `POST /tx` — validate a signed user op, enqueue it for the inclusion lane, +//! and wait for the lane's commit ack before responding. Synchronous from the +//! client's perspective: 200 means included. + +use std::sync::Arc; +use std::time::SystemTime; + +use alloy_sol_types::Eip712Domain; +use axum::Router; +use axum::extract::{Json, State}; +use axum::http::StatusCode; +use axum::routing::post; +use tokio::sync::mpsc::{self, error::TrySendError}; +use tokio::sync::oneshot; +use tracing::debug; + +use crate::http::ApiError; +use crate::ingress::inclusion_lane::PendingUserOp; +use crate::runtime::shutdown::ShutdownSignal; +use sequencer_core::api::{TxRequest, TxResponse}; +use sequencer_core::user_op::SignedUserOp; + +/// State for the submit endpoint. Kept narrow — only what `/tx` actually needs. +#[derive(Clone)] +pub(crate) struct SubmitState { + pub tx_sender: mpsc::Sender, + pub domain: Eip712Domain, + pub max_user_op_data_bytes: usize, + pub shutdown: ShutdownSignal, +} + +impl SubmitState { + pub(crate) fn new( + tx_sender: mpsc::Sender, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + ) -> Self { + Self { + tx_sender, + domain, + max_user_op_data_bytes, + shutdown, + } + } + + fn reject_if_shutting_down(&self) -> Result<(), ApiError> { + if self.shutdown.is_shutdown_requested() { + Err(ApiError::unavailable("sequencer shutting down")) + } else { + Ok(()) + } + } +} + +/// Build the ingress router. Caller wires it into an `axum::serve` listener. +pub(crate) fn router(state: Arc) -> Router { + Router::new() + .route("/tx", post(submit_tx)) + .with_state(state) +} + +async fn submit_tx( + State(state): State>, + req: Result, axum::extract::rejection::JsonRejection>, +) -> Result, ApiError> { + let Json(req) = req.map_err(map_json_rejection)?; + + let signed = req + .into_signed_user_op(&state.domain, state.max_user_op_data_bytes) + .map_err(ApiError::from)?; + let nonce = signed.user_op.nonce; + let sender = signed.sender; + let ack = enqueue_verified_tx(state.as_ref(), signed)?; + + let commit_result = ack + .await + .map_err(|_| ApiError::internal_error("inclusion lane dropped response"))?; + commit_result.map_err(ApiError::from)?; + debug!(sender = %sender, nonce, "tx committed"); + + Ok(Json(TxResponse { + ok: true, + sender: sender.to_string(), + nonce, + })) +} + +/// Normalize JSON-extractor failures into fixed client-facing messages. +/// Keeps the public API contract stable across axum upgrades and avoids +/// reflecting parser internals (serde line/column, token excerpts) to callers. +fn map_json_rejection(err: axum::extract::rejection::JsonRejection) -> ApiError { + use axum::extract::rejection::JsonRejection; + + tracing::debug!(error = %err, "JSON extraction failed"); + + if err.status() == StatusCode::PAYLOAD_TOO_LARGE { + ApiError::payload_too_large("request body too large") + } else { + match err { + JsonRejection::MissingJsonContentType(_) => { + ApiError::bad_request("missing content type") + } + _ => ApiError::bad_request("invalid JSON"), + } + } +} + +fn enqueue_verified_tx( + state: &SubmitState, + signed: SignedUserOp, +) -> Result>, ApiError> +{ + state.reject_if_shutting_down()?; + + let (respond_to, recv) = oneshot::channel(); + let pending = PendingUserOp { + signed, + respond_to, + received_at: SystemTime::now(), + }; + + match state.tx_sender.try_send(pending) { + Ok(()) => Ok(recv), + Err(TrySendError::Full(_)) => Err(ApiError::overloaded("queue full")), + Err(TrySendError::Closed(_)) => Err(ApiError::internal_error("inclusion lane unavailable")), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use alloy_primitives::{Address, Signature}; + use alloy_sol_types::Eip712Domain; + use alloy_sol_types::SolStruct; + use axum::http::StatusCode; + use k256::ecdsa::SigningKey; + use k256::ecdsa::signature::hazmat::PrehashSigner; + use std::sync::Arc; + use tempfile::TempDir; + use tokio::sync::mpsc; + + use crate::storage::Storage; + use sequencer_core::user_op::UserOp; + + #[tokio::test(flavor = "current_thread")] + async fn submit_tx_rejects_when_shutdown_has_started() { + let db = TempDir::new().expect("create temp dir"); + let db_path = db.path().join("sequencer.db"); + let _storage = Storage::open(&db_path.to_string_lossy()).expect("create db"); + let shutdown = ShutdownSignal::default(); + shutdown.request_shutdown(); + + let (tx_sender, _rx) = mpsc::channel::(1); + let state = Arc::new(SubmitState::new( + tx_sender, + Eip712Domain { + name: None, + version: None, + chain_id: None, + verifying_contract: None, + salt: None, + }, + 128, + shutdown, + )); + + let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + let user_op = UserOp { + nonce: 0, + max_fee: 0, + data: Vec::new().into(), + }; + let request = TxRequest { + message: user_op.clone(), + signature: sign_user_op_hex(&state.domain, &user_op, &signing_key), + sender: sender.to_string(), + }; + + let result = submit_tx(State(state), Ok(Json(request))).await; + + let err = result.expect_err("submit should be rejected during shutdown"); + assert_eq!(err.status(), StatusCode::SERVICE_UNAVAILABLE); + assert_eq!(err.code(), "UNAVAILABLE"); + } + + fn sign_user_op_hex( + domain: &Eip712Domain, + user_op: &UserOp, + signing_key: &SigningKey, + ) -> String { + let hash = user_op.eip712_signing_hash(domain); + let k256_sig = signing_key + .sign_prehash(hash.as_slice()) + .expect("sign user op hash"); + + let sender = address_from_signing_key(signing_key); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|candidate| { + candidate + .recover_address_from_prehash(&hash) + .ok() + .map(|value| value == sender) + .unwrap_or(false) + }) + .expect("recoverable parity for signature"); + + alloy_primitives::hex::encode_prefixed(signature.as_bytes()) + } + + fn address_from_signing_key(signing_key: &SigningKey) -> Address { + let verifying = signing_key.verifying_key().to_encoded_point(false); + Address::from_raw_public_key(&verifying.as_bytes()[1..]) + } + + // ── S-malleability — no alternate signature can recover a different + // address at our boundary. Structurally guaranteed by alloy+k256; this is + // a regression lock. + + #[test] + fn s_malleable_signature_cannot_recover_a_different_address() { + use alloy_primitives::{B256, U256}; + + // secp256k1 curve order `n`. s' = n - s is the canonical malleable + // transform that pairs with flipped parity to produce an alternate + // signature recovering the same public key. + const SECP256K1_N: U256 = U256::from_be_slice(&[ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFE, 0xBA, 0xAE, 0xDC, 0xE6, 0xAF, 0x48, 0xA0, 0x3B, 0xBF, 0xD2, 0x5E, 0x8C, + 0xD0, 0x36, 0x41, 0x41, + ]); + + let signing_key = SigningKey::from_bytes((&[0x42_u8; 32]).into()).expect("key"); + let expected_sender = address_from_signing_key(&signing_key); + + let msg_hash = B256::from([0xfe_u8; 32]); + let k256_sig = signing_key + .sign_prehash(msg_hash.as_slice()) + .expect("sign prehash"); + + // k256's `sign_prehash` returns a low-s signature by default. Find the + // parity that pairs with it to recover the expected signer. + let valid_sig = [false, true] + .into_iter() + .map(|p| Signature::from_signature_and_parity(k256_sig, p)) + .find(|s| { + s.recover_address_from_prehash(&msg_hash) + .ok() + .is_some_and(|a| a == expected_sender) + }) + .expect("low-s signature must recover the signer with one parity"); + + // Construct the S-malleable variant: same r, s' = n - s, flipped parity. + let malleable_sig = + Signature::new(valid_sig.r(), SECP256K1_N - valid_sig.s(), !valid_sig.v()); + assert_ne!( + malleable_sig.s(), + valid_sig.s(), + "malleable transform must actually change the signature", + ); + + match malleable_sig.recover_address_from_prehash(&msg_hash) { + Err(_) => { + // alloy rejected the high-s form (EIP-2 style). Impersonation + // via malleability is structurally impossible at recovery. + } + Ok(addr) => { + // alloy accepted high-s; it MUST return the same signer. + // Any other outcome would let an attacker grind a distinct + // signature that recovers a different address. + assert_eq!( + addr, expected_sender, + "malleable signature recovered a DIFFERENT address — impersonation possible", + ); + } + } + } +} diff --git a/sequencer/src/inclusion_lane/catch_up.rs b/sequencer/src/ingress/inclusion_lane/catch_up.rs similarity index 79% rename from sequencer/src/inclusion_lane/catch_up.rs rename to sequencer/src/ingress/inclusion_lane/catch_up.rs index 8515b34..b01cff0 100644 --- a/sequencer/src/inclusion_lane/catch_up.rs +++ b/sequencer/src/ingress/inclusion_lane/catch_up.rs @@ -1,6 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Startup-only replay: walk the persisted ordered-L2-tx stream and feed it +//! to the application so its in-memory state matches the DB before the lane +//! starts taking new work. Runs once, before the hot loop. + use alloy_primitives::Address; use crate::storage::Storage; @@ -30,12 +34,14 @@ pub(super) fn catch_up_application_paged( batch_submitter_address: Address, page_size: usize, ) -> Result<(), CatchUpError> { - let mut next_offset = 0; + // Cursor tracks the DB offset of the last processed item. + // SQLite rowids start at 1, so 0 means "before all items". + let mut next_offset: u64 = 0; let page_size = page_size.max(1); loop { let replay = storage - .load_ordered_l2_txs_page_from(next_offset, page_size) + .ordered_l2_txs_page_from(next_offset, page_size) .map_err(|source| CatchUpError::LoadReplay { offset: next_offset, source, @@ -45,9 +51,9 @@ pub(super) fn catch_up_application_paged( return Ok(()); } - for item in replay { + for (db_offset, item) in replay { replay_sequenced_l2_tx(app, batch_submitter_address, item)?; - next_offset = next_offset.saturating_add(1); + next_offset = db_offset; } } } diff --git a/sequencer/src/ingress/inclusion_lane/config.rs b/sequencer/src/ingress/inclusion_lane/config.rs new file mode 100644 index 0000000..1bc9f69 --- /dev/null +++ b/sequencer/src/ingress/inclusion_lane/config.rs @@ -0,0 +1,52 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime knobs for the inclusion lane. Defaults tuned for low-latency +//! Ethereum L1 deployment; tests override individual fields directly. + +use std::time::Duration; + +use alloy_primitives::Address; + +const DEFAULT_MAX_USER_OPS_PER_CHUNK: usize = 64; +const DEFAULT_SAFE_INPUT_BUFFER_CAPACITY: usize = 2048; +const DEFAULT_MAX_BATCH_OPEN: Duration = Duration::from_secs(2 * 60 * 60); +const DEFAULT_IDLE_POLL_INTERVAL: Duration = Duration::from_millis(10); +/// Minimum gap between L1 safe-frontier polls. Bounds the SQL load when the +/// lane is otherwise idle. L1 safe head advances at ~12s cadence, so 1s is +/// well inside the responsiveness budget. +const DEFAULT_FRONTIER_MIN_INTERVAL: Duration = Duration::from_secs(1); + +#[derive(Debug, Clone, Copy)] +pub struct InclusionLaneConfig { + /// Address of the batch submitter wallet. Direct inputs from this sender + /// are skipped during application execution (they're our own batch + /// submissions; the application doesn't apply them as user-level inputs). + pub batch_submitter_address: Address, + /// Cap on user ops dequeued per chunk. Bounds per-chunk SQL transaction + /// size and (more importantly) ack latency for the first op in each chunk. + pub max_user_ops_per_chunk: usize, + /// Reusable buffer size for safe-input loading. Doesn't bound work; just + /// the memory ceiling for the read-and-execute scratch buffer. + pub safe_input_buffer_capacity: usize, + /// Force a batch close after this much wall time, regardless of size. + pub max_batch_open: Duration, + /// Sleep duration when the lane has nothing to do (no queue, no advance). + pub idle_poll_interval: Duration, + /// Minimum gap between L1 safe-frontier polls. Bounds idle SQL load. See + /// `DEFAULT_FRONTIER_MIN_INTERVAL` for the rationale on the default. + pub frontier_min_interval: Duration, +} + +impl InclusionLaneConfig { + pub fn new(batch_submitter_address: Address) -> Self { + Self { + batch_submitter_address, + max_user_ops_per_chunk: DEFAULT_MAX_USER_OPS_PER_CHUNK, + safe_input_buffer_capacity: DEFAULT_SAFE_INPUT_BUFFER_CAPACITY, + max_batch_open: DEFAULT_MAX_BATCH_OPEN, + idle_poll_interval: DEFAULT_IDLE_POLL_INTERVAL, + frontier_min_interval: DEFAULT_FRONTIER_MIN_INTERVAL, + } + } +} diff --git a/sequencer/src/inclusion_lane/error.rs b/sequencer/src/ingress/inclusion_lane/error.rs similarity index 60% rename from sequencer/src/inclusion_lane/error.rs rename to sequencer/src/ingress/inclusion_lane/error.rs index 03333db..7849c75 100644 --- a/sequencer/src/inclusion_lane/error.rs +++ b/sequencer/src/ingress/inclusion_lane/error.rs @@ -1,6 +1,9 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Lane-level error types. Returned from the lane's join handle; the runtime +//! logs them and may shut down depending on severity. + use sequencer_core::application::AppError; use thiserror::Error; @@ -13,36 +16,18 @@ pub enum InclusionLaneError { #[source] source: CatchUpError, }, - #[error("cannot load next undrained safe-input index")] - LoadNextUndrainedDirectInputIndex { - #[source] - source: rusqlite::Error, - }, - #[error("cannot load safe inputs")] - LoadSafeInputs { + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("user op execution failed")] + ExecuteUserOp { #[source] - source: rusqlite::Error, - }, - #[error("cannot load/create open batch/frame")] - LoadOpenState { - #[source] - source: rusqlite::Error, - }, - #[error("append user ops failed")] - AppendUserOps { - #[source] - source: rusqlite::Error, + source: AppError, }, #[error("direct input execution failed")] ExecuteDirectInput { #[source] source: AppError, }, - #[error("failed to close/rotate frame")] - CloseFrameRotate { - #[source] - source: rusqlite::Error, - }, } #[derive(Debug, Error)] diff --git a/sequencer/src/ingress/inclusion_lane/mod.rs b/sequencer/src/ingress/inclusion_lane/mod.rs new file mode 100644 index 0000000..022376c --- /dev/null +++ b/sequencer/src/ingress/inclusion_lane/mod.rs @@ -0,0 +1,455 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Hot-path loop. The lane runs three layers of amortization on each iteration: +//! +//! - **Frontier check** (time-gated by `frontier_min_interval`): polls L1's +//! safe head; advances frame boundary if it moved. +//! - **Inner drain loop** (`run_inner_drain`): processes user-op chunks until +//! the queue empties or the batch hits its size target. +//! - **Per-chunk persistence** (`max_user_ops_per_chunk`): each chunk commits +//! in one SQL transaction, bounding ack latency for the first op in it. +//! +//! The lane is a single-thread `spawn_blocking` task. SQLite is the only +//! synchronization with other components (input reader, batch submitter). + +mod catch_up; +mod config; +mod error; +mod types; + +#[cfg(test)] +mod tests; + +pub use config::InclusionLaneConfig; +pub use error::InclusionLaneError; +pub use types::{PendingUserOp, SequencerError}; + +use std::thread; +use std::time::{Duration, Instant, SystemTime}; + +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; +use sequencer_core::application::{AppError, Application, ExecutionOutcome}; +use sequencer_core::l2_tx::DirectInput; +use sequencer_core::user_op::SignedUserOp; + +use catch_up::catch_up_application; + +/// Owns the application instance, the `Storage` write handle, and the user-op +/// receiver for the lifetime of the sequencer process. +pub struct InclusionLane { + rx: mpsc::Receiver, + shutdown: ShutdownSignal, + app: A, + storage: Storage, + config: InclusionLaneConfig, +} + +impl InclusionLane { + /// Spawn the lane on a blocking thread. Returns the input MPSC sender (for + /// the API to enqueue user ops) and the join handle (for the runtime to + /// observe lane shutdown). + /// + /// The handle resolves to `Ok(())` on graceful shutdown, or an + /// `InclusionLaneError` if the lane crashed. + pub fn start( + queue_capacity: usize, + shutdown: ShutdownSignal, + app: A, + storage: Storage, + config: InclusionLaneConfig, + ) -> ( + mpsc::Sender, + JoinHandle>, + ) { + let (tx, rx) = mpsc::channel::(queue_capacity.max(1)); + let handle = tokio::task::spawn_blocking(move || { + let mut lane = Self { + rx, + shutdown, + app, + storage, + config, + }; + lane.run_forever() + }); + (tx, handle) + } + + fn run_forever(&mut self) -> Result<(), InclusionLaneError> { + self.run_catch_up()?; + let mut included = Vec::with_capacity(self.config.max_user_ops_per_chunk.max(1)); + let mut safe_inputs = Vec::with_capacity(self.config.safe_input_buffer_capacity.max(1)); + let mut lane_state = self.load_or_initialize_lane_state(&mut safe_inputs)?; + + loop { + if self.shutdown.is_shutdown_requested() { + self.reject_pending_user_ops_due_to_shutdown(); + return Ok(()); + } + + self.maybe_advance_safe_frontier(&mut lane_state, &mut safe_inputs)?; + let drain = self.run_inner_drain(&mut lane_state.head, &mut included)?; + + if drain.hit_batch_target() + || should_close_batch_by_time(&lane_state.head, &self.config) + { + let next_safe_block = lane_state.head.safe_block; + self.storage + .close_frame_and_batch(&mut lane_state.head, next_safe_block)?; + } else if !drain.drained_any() { + thread::sleep(self.config.idle_poll_interval); + } + } + } + + fn run_catch_up(&mut self) -> Result<(), InclusionLaneError> { + catch_up_application( + &mut self.app, + &mut self.storage, + self.config.batch_submitter_address, + ) + .map_err(|source| InclusionLaneError::CatchUp { source }) + } + + fn load_or_initialize_lane_state( + &mut self, + safe_inputs: &mut Vec, + ) -> Result { + let next_safe_input_index = self.storage.next_undrained_safe_input_index()?; + + let last_drained_direct_range = SafeInputRange::empty_at(next_safe_input_index); + if let Some(head) = self.storage.open_state()? { + return Ok(LaneState::new(last_drained_direct_range, head)); + } + + let frontier = self.storage.safe_input_frontier()?; + assert!( + frontier.end_exclusive >= last_drained_direct_range.end(), + "safe-input head regressed during lane initialization: safe_end={}, next={}", + frontier.end_exclusive, + last_drained_direct_range.end() + ); + + let leading_direct_range = last_drained_direct_range.advance_to(frontier.end_exclusive); + self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; + let head = self + .storage + .initialize_open_state(frontier.safe_block, leading_direct_range)?; + + Ok(LaneState::new(leading_direct_range, head)) + } + + /// Drain user ops in chunks until the queue empties or we cross the batch + /// size target. Each chunk persists separately so ack latency stays bounded + /// by `max_user_ops_per_chunk`. + fn run_inner_drain( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result { + let mut drained_any = false; + loop { + let (count, outcome) = self.process_user_op_chunk(head, included)?; + if count > 0 { + drained_any = true; + } + match outcome { + ChunkOutcome::QueueEmpty => { + return Ok(if drained_any { + DrainSummary::DrainedQueue + } else { + DrainSummary::Idle + }); + } + ChunkOutcome::HitBatchTarget => return Ok(DrainSummary::HitBatchTarget), + ChunkOutcome::MoreToProcess => continue, + } + } + } + + fn process_user_op_chunk( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result<(usize, ChunkOutcome), InclusionLaneError> { + included.clear(); + let outcome = match dequeue_and_execute_user_op_chunk::( + &mut self.rx, + &mut self.app, + head.frame_fee, + self.config.max_user_ops_per_chunk.max(1), + head, + included, + ) { + Ok(outcome) => outcome, + Err(err) => { + Self::respond_internal_to_all(included, "application internal error".to_string()); + return Err(err); + } + }; + let included_count = included.len(); + + self.persist_included_user_ops(head, included)?; + + for item in included.drain(..) { + let _ = item.respond_to.send(Ok(())); + } + + Ok((included_count, outcome)) + } + + /// Time-gated to bound idle SQL load. High-throughput batches can delay + /// this past the gate, but a full batch is far less than 1s of work in + /// practice. + fn maybe_advance_safe_frontier( + &mut self, + lane_state: &mut LaneState, + safe_inputs: &mut Vec, + ) -> Result<(), InclusionLaneError> { + if !lane_state.frontier_check_due(self.config.frontier_min_interval) { + return Ok(()); + } + lane_state.mark_frontier_checked(); + + let frontier = self.storage.safe_input_frontier()?; + assert!( + frontier.end_exclusive >= lane_state.last_drained_direct_range.end(), + "safe-input head regressed: safe_end={}, next={}", + frontier.end_exclusive, + lane_state.last_drained_direct_range.end() + ); + if frontier.safe_block <= lane_state.head.safe_block { + return Ok(()); + } + + let leading_direct_range = lane_state + .last_drained_direct_range + .advance_to(frontier.end_exclusive); + self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; + self.storage.close_frame_only( + &mut lane_state.head, + frontier.safe_block, + leading_direct_range, + )?; + lane_state.last_drained_direct_range = leading_direct_range; + Ok(()) + } + + fn persist_included_user_ops( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result<(), InclusionLaneError> { + self.storage + .append_user_ops_chunk(head, included.as_slice()) + .map_err(|err| { + Self::respond_internal_to_all(included, "internal storage error".to_string()); + InclusionLaneError::Storage(err) + }) + } + + fn execute_safe_inputs_range( + &mut self, + direct_range: SafeInputRange, + chunk: &mut Vec, + ) -> Result<(), InclusionLaneError> { + let max_chunk_len = self.config.safe_input_buffer_capacity.max(1) as u64; + for chunk_range in direct_range.chunks(max_chunk_len) { + self.storage.fill_safe_inputs(chunk_range, chunk)?; + self.execute_safe_inputs_chunk(chunk.as_slice())?; + } + Ok(()) + } + + fn execute_safe_inputs_chunk( + &mut self, + chunk: &[StoredSafeInput], + ) -> Result<(), InclusionLaneError> { + for input in chunk { + if input.sender == self.config.batch_submitter_address { + continue; + } + let direct_input = DirectInput { + sender: input.sender, + block_number: input.block_number, + payload: input.payload.clone(), + }; + + self.app + .execute_direct_input(&direct_input) + .map_err(|source| InclusionLaneError::ExecuteDirectInput { source })?; + } + Ok(()) + } + + fn respond_internal_to_all(pending: &mut Vec, message: String) { + for item in pending.drain(..) { + let _ = item + .respond_to + .send(Err(SequencerError::internal(message.clone()))); + } + } + + fn reject_pending_user_ops_due_to_shutdown(&mut self) { + while let Ok(item) = self.rx.try_recv() { + let _ = item + .respond_to + .send(Err(SequencerError::unavailable("sequencer shutting down"))); + } + } +} + +#[derive(Debug, PartialEq, Eq)] +enum DrainSummary { + /// Queue was empty; nothing was drained this pass. + Idle, + /// Drained the queue, no batch close needed (size-wise). + DrainedQueue, + /// Drained at least one op AND crossed the batch size target. + /// (`(false, true)` is unreachable: the size check fires only after a + /// successful execution, so `HitBatchTarget` always implies `drained_any`.) + HitBatchTarget, +} + +impl DrainSummary { + fn hit_batch_target(&self) -> bool { + matches!(self, Self::HitBatchTarget) + } + + fn drained_any(&self) -> bool { + !matches!(self, Self::Idle) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(super) enum ChunkOutcome { + /// Queue drained or sender disconnected with at least one op processed. + QueueEmpty, + /// Including the latest op pushed the batch over `max_batch_user_op_bytes`. + HitBatchTarget, + /// Hit `max_user_ops_per_chunk` cap; queue may still have more. + MoreToProcess, +} + +fn should_close_batch_by_time(head: &WriteHead, config: &InclusionLaneConfig) -> bool { + let age = SystemTime::now() + .duration_since(head.batch_created_at) + .unwrap_or_default(); + age >= config.max_batch_open +} + +fn execute_user_op( + app: &mut impl Application, + item: PendingUserOp, + current_frame_fee: u16, + included: &mut Vec, +) -> Result<(), InclusionLaneError> { + match app.validate_and_execute_user_op( + item.signed.sender, + &item.signed.user_op, + current_frame_fee, + ) { + Ok(ExecutionOutcome::Included { .. }) => included.push(item), + Ok(ExecutionOutcome::Invalid(reason)) => { + let _ = item + .respond_to + .send(Err(SequencerError::invalid(reason.to_string()))); + } + Err(AppError::Internal { reason }) => { + let _ = item + .respond_to + .send(Err(SequencerError::internal(reason.clone()))); + return Err(InclusionLaneError::ExecuteUserOp { + source: AppError::Internal { reason }, + }); + } + } + Ok(()) +} + +/// Dequeue and execute up to `max_chunk` user ops, stopping early if the batch +/// would cross its size target. Returns the outcome that drove the stop. +/// +/// `head.batch_user_op_count` reflects already-persisted ops; `included.len()` +/// is the count we'd add by persisting now. When their sum's bytes equal or +/// exceed `head.max_batch_user_op_bytes`, we stop and the caller closes the +/// batch. +pub(super) fn dequeue_and_execute_user_op_chunk( + rx: &mut mpsc::Receiver, + app: &mut A, + current_frame_fee: u16, + max_chunk: usize, + head: &WriteHead, + included: &mut Vec, +) -> Result { + let mut executed = 0_usize; + + while executed < max_chunk { + match rx.try_recv() { + Ok(item) => { + execute_user_op(app, item, current_frame_fee, included)?; + executed = executed.saturating_add(1); + + let projected = head + .batch_user_op_count + .saturating_add(included.len() as u64); + if user_op_count_to_bytes::(projected) >= head.max_batch_user_op_bytes { + return Ok(ChunkOutcome::HitBatchTarget); + } + } + Err(mpsc::error::TryRecvError::Empty) => return Ok(ChunkOutcome::QueueEmpty), + Err(mpsc::error::TryRecvError::Disconnected) => { + if executed == 0 { + return Err(InclusionLaneError::ChannelClosed); + } + return Ok(ChunkOutcome::QueueEmpty); + } + } + } + + Ok(ChunkOutcome::MoreToProcess) +} + +fn user_op_count_to_bytes(user_op_count: u64) -> u64 { + let one_user_op_bytes = SignedUserOp::max_batch_metadata() + A::MAX_METHOD_PAYLOAD_BYTES; + user_op_count.saturating_mul(one_user_op_bytes as u64) +} + +/// Lane-local state threaded through every loop iteration. +/// +/// `head` and `last_drained_direct_range` stay in lockstep — every safe-frontier +/// advance updates both `head.safe_block` (persisted in the open frame) and +/// `last_drained_direct_range.end()` (in-memory drain cursor). +/// +/// `last_frontier_check` is the time gate's bookkeeping; `None` initially so +/// the first iteration always polls. +struct LaneState { + last_drained_direct_range: SafeInputRange, + head: WriteHead, + last_frontier_check: Option, +} + +impl LaneState { + fn new(last_drained_direct_range: SafeInputRange, head: WriteHead) -> Self { + Self { + last_drained_direct_range, + head, + last_frontier_check: None, + } + } + + fn frontier_check_due(&self, min_interval: Duration) -> bool { + self.last_frontier_check + .map(|t| t.elapsed() >= min_interval) + .unwrap_or(true) + } + + fn mark_frontier_checked(&mut self) { + self.last_frontier_check = Some(Instant::now()); + } +} diff --git a/sequencer/src/inclusion_lane/tests.rs b/sequencer/src/ingress/inclusion_lane/tests.rs similarity index 79% rename from sequencer/src/inclusion_lane/tests.rs rename to sequencer/src/ingress/inclusion_lane/tests.rs index 4778dcf..3cab022 100644 --- a/sequencer/src/inclusion_lane/tests.rs +++ b/sequencer/src/ingress/inclusion_lane/tests.rs @@ -9,18 +9,18 @@ use std::time::{Duration, SystemTime}; use alloy_primitives::{Address, Signature, U256}; use app_core::application::MAX_METHOD_PAYLOAD_BYTES as WALLET_MAX_METHOD_PAYLOAD_BYTES; use rusqlite::params; -use tempfile::TempDir; use tokio::sync::{mpsc, oneshot}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::test_helpers::{SENDER_A, default_protocol_timing, temp_db}; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; use sequencer_core::application::{AppError, AppOutputs, Application, InvalidReason}; use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; use sequencer_core::user_op::{SignedUserOp, UserOp}; use super::catch_up::catch_up_application_paged; +use super::dequeue_and_execute_user_op_chunk; use super::error::CatchUpError; -use super::lane::dequeue_and_execute_user_op_chunk; use super::{InclusionLane, InclusionLaneConfig, InclusionLaneError, PendingUserOp}; #[derive(Default)] @@ -66,9 +66,33 @@ impl Application for TestApp { } } -struct TestDb { - _dir: TempDir, - path: String, +struct InternalUserOpApp; + +impl Application for InternalUserOpApp { + const MAX_METHOD_PAYLOAD_BYTES: usize = WALLET_MAX_METHOD_PAYLOAD_BYTES; + + fn current_user_nonce(&self, _sender: Address) -> u32 { + 0 + } + + fn current_user_balance(&self, _sender: Address) -> U256 { + U256::MAX + } + + fn validate_user_op( + &self, + _sender: Address, + _user_op: &UserOp, + _current_fee: u16, + ) -> Result<(), InvalidReason> { + Ok(()) + } + + fn execute_valid_user_op(&mut self, _user_op: &ValidUserOp) -> Result { + Err(AppError::Internal { + reason: "app invariant failed".to_string(), + }) + } } #[derive(Debug, Clone, PartialEq, Eq)] @@ -182,18 +206,6 @@ impl Application for ReplayRecordingApp { } } -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-inclusion-lane-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} - fn default_test_config() -> InclusionLaneConfig { InclusionLaneConfig { batch_submitter_address: Address::from_slice(&[0xff; 20]), @@ -201,6 +213,9 @@ fn default_test_config() -> InclusionLaneConfig { safe_input_buffer_capacity: 16, max_batch_open: Duration::MAX, idle_poll_interval: Duration::from_millis(2), + // Tests should observe frontier changes immediately rather than wait + // for the production gate. + frontier_min_interval: Duration::ZERO, } } @@ -212,16 +227,16 @@ async fn start_lane( ShutdownSignal, tokio::task::JoinHandle>, ) { - let storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); + storage + .append_safe_inputs(0, &[], SENDER_A, &default_protocol_timing()) + .expect("seed observed safe head"); let shutdown = ShutdownSignal::default(); let (tx, handle) = InclusionLane::start(128, shutdown.clone(), TestApp::default(), storage, config); let initialized = wait_until(Duration::from_secs(2), || { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); - storage - .load_open_state() - .expect("load open state") - .is_some() + let mut storage = Storage::open(db_path).expect("open storage"); + storage.open_state().expect("load open state").is_some() }) .await; assert!(initialized, "lane should initialize its first open state"); @@ -238,7 +253,9 @@ fn make_pending_user_op( let (respond_to, recv) = oneshot::channel(); let user_op = UserOp { nonce: 0, - max_fee: 1, + // Must be >= the DB default recommended_fee (1060) to pass the + // protocol-level max_fee >= fee_price check in the trait default. + max_fee: u16::MAX, data: vec![seed; 4].into(), }; ( @@ -256,7 +273,7 @@ fn make_pending_user_op( } fn seed_replay_fixture(db_path: &str) -> Vec { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -274,6 +291,8 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xaa], block_number: 10, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append first direct input"); storage @@ -292,6 +311,8 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xbb], block_number: 20, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append second direct input"); storage @@ -306,6 +327,8 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xcc], block_number: 30, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append third direct input"); storage @@ -344,14 +367,14 @@ fn seed_replay_fixture(db_path: &str) -> Vec { } fn read_count(db_path: &str, table: &str) -> i64 { - let conn = Storage::open_connection(db_path, "NORMAL").expect("open sqlite reader"); + let conn = Storage::open_connection(db_path).expect("open sqlite reader"); let sql = format!("SELECT COUNT(*) FROM {table}"); conn.query_row(sql.as_str(), [], |row| row.get(0)) .expect("count rows") } fn read_frame_direct_count(db_path: &str, batch_index: i64, frame_in_batch: i64) -> i64 { - let conn = Storage::open_connection(db_path, "NORMAL").expect("open sqlite reader"); + let conn = Storage::open_connection(db_path).expect("open sqlite reader"); conn.query_row( "SELECT COUNT(*) FROM sequenced_l2_txs WHERE batch_index = ?1 @@ -413,8 +436,7 @@ async fn ack_happens_after_chunk_commit_without_closing_frame() { async fn direct_inputs_close_frame_and_persist_drain() { let db = temp_db("directs-close-frame"); let (_tx, shutdown, lane_handle) = start_lane(db.path.as_str(), default_test_config()).await; - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( @@ -424,6 +446,8 @@ async fn direct_inputs_close_frame_and_persist_drain() { payload: vec![0xaa], block_number: 10, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append safe direct input"); @@ -443,9 +467,12 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { let db = temp_db("sequenced-safe-inputs-skip"); let batch_submitter_address = Address::from([0xfe; 20]); let executed_direct_inputs = Arc::new(AtomicU64::new(0)); - let storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .append_safe_inputs(0, &[], SENDER_A, &default_protocol_timing()) + .expect("seed observed safe head"); let shutdown = ShutdownSignal::default(); - let (tx, lane_handle) = InclusionLane::start( + let (_tx, lane_handle) = InclusionLane::start( 128, shutdown.clone(), SharedCountingApp { @@ -458,17 +485,13 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { }, ); let initialized = wait_until(Duration::from_secs(2), || { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - storage - .load_open_state() - .expect("load open state") - .is_some() + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage.open_state().expect("load open state").is_some() }) .await; assert!(initialized, "lane should initialize open state"); - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( 10, @@ -477,6 +500,8 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { payload: vec![0xaa], block_number: 10, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append safe batch-submitter input"); @@ -484,7 +509,6 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { read_frame_direct_count(db.path.as_str(), 0, 1) == 1 }) .await; - drop(tx); shutdown_lane(&shutdown, lane_handle).await; assert!( @@ -504,8 +528,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { let mut config = default_test_config(); config.safe_input_buffer_capacity = 2; let (_tx, shutdown, lane_handle) = start_lane(db.path.as_str(), config).await; - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); let mut directs = Vec::new(); for index in 0..5_u64 { @@ -516,7 +539,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { }); } feeder_storage - .append_safe_inputs(10, directs.as_slice()) + .append_safe_inputs(10, directs.as_slice(), SENDER_A, &default_protocol_timing()) .expect("append safe direct inputs"); let drained = wait_until(Duration::from_secs(2), || { @@ -534,8 +557,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { let db = temp_db("directs-before-later-userops"); let (tx, shutdown, lane_handle) = start_lane(db.path.as_str(), default_test_config()).await; - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( @@ -545,6 +567,8 @@ async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { payload: vec![0xaa], block_number: 10, }], + SENDER_A, + &default_protocol_timing(), ) .expect("append safe direct input"); @@ -564,11 +588,14 @@ async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { .expect("wait for ack") .expect("ack channel open"); - let replay = { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let replay: Vec = { + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); storage - .load_ordered_l2_txs_from(0) + .ordered_l2_txs_page_from(0, 1_000_000) .expect("load ordered replay") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() }; shutdown_lane(&shutdown, lane_handle).await; @@ -636,7 +663,7 @@ async fn batch_closes_when_max_user_op_bytes_is_reached() { // Set alpha high enough that batch_size_target ≤ one user op (126 bytes). // 55000*1000/(17000*26) = 124 bytes < 126. { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); storage.set_alpha(17000, 1000).expect("set alpha"); } let config = default_test_config(); @@ -660,15 +687,32 @@ async fn batch_closes_when_max_user_op_bytes_is_reached() { assert_eq!(drain, 0); } +/// Test fixture: a `WriteHead` whose size budget is unbounded, so the early-stop +/// in `dequeue_and_execute_user_op_chunk` never triggers from the size check +/// alone. Tests that want to exercise the size check construct their own. +fn unbounded_head() -> WriteHead { + WriteHead { + batch_index: 0, + batch_created_at: SystemTime::now(), + frame_fee: 0, + safe_block: 0, + batch_user_op_count: 0, + open_frame_user_op_count: 0, + frame_in_batch: 0, + max_batch_user_op_bytes: u64::MAX, + } +} + #[test] fn dequeue_returns_channel_closed_when_disconnected() { let (tx, mut rx) = mpsc::channel::(1); drop(tx); let mut app = TestApp::default(); let mut included = Vec::new(); + let head = unbounded_head(); - let err = - dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 1, &mut included).unwrap_err(); + let err = dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 1, &head, &mut included) + .unwrap_err(); assert!(matches!(err, InclusionLaneError::ChannelClosed)); } @@ -681,16 +725,44 @@ fn dequeue_flushes_executed_ops_before_observing_disconnect() { let mut app = TestApp::default(); let mut included = Vec::new(); - dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &mut included) + let head = unbounded_head(); + dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &head, &mut included) .expect("should flush processed user ops before disconnect"); assert_eq!(included.len(), 1); } +#[test] +fn dequeue_returns_lane_error_when_app_reports_internal() { + let (tx, mut rx) = mpsc::channel::(1); + let (pending, recv) = make_pending_user_op(0x45); + tx.blocking_send(pending).expect("enqueue pending user op"); + + let mut app = InternalUserOpApp; + let mut included = Vec::new(); + let head = unbounded_head(); + let err = dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &head, &mut included) + .expect_err("internal application error should stop the lane"); + + assert!(matches!(err, InclusionLaneError::ExecuteUserOp { .. })); + assert!( + included.is_empty(), + "internal errors must not leave an op ready to persist" + ); + let response = recv + .blocking_recv() + .expect("lane should respond to triggering op") + .expect_err("triggering op should receive internal error"); + assert!(matches!( + response, + super::SequencerError::Internal(message) if message == "app invariant failed" + )); +} + #[test] fn catch_up_replays_multiple_pages() { let db = temp_db("catch-up-multi-page"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::default(); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) @@ -704,7 +776,7 @@ fn catch_up_replays_multiple_pages() { fn catch_up_replays_from_storage_even_when_app_reports_executed_inputs() { let db = temp_db("catch-up-offset"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::with_executed_input_count(3); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) @@ -718,7 +790,7 @@ fn catch_up_replays_from_storage_even_when_app_reports_executed_inputs() { fn catch_up_handles_mixed_user_ops_and_direct_inputs_across_page_boundary() { let db = temp_db("catch-up-mixed-page-boundary"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::default(); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 4) @@ -730,8 +802,7 @@ fn catch_up_handles_mixed_user_ops_and_direct_inputs_across_page_boundary() { #[test] fn catch_up_load_error_reports_offset() { let db = temp_db("catch-up-load-error"); - let mut storage = - Storage::open_without_migrations(db.path.as_str(), "NORMAL").expect("open raw storage"); + let mut storage = Storage::open_without_migrations(db.path.as_str()).expect("open raw storage"); let mut app = ReplayRecordingApp::default(); let err = catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) diff --git a/sequencer/src/inclusion_lane/types.rs b/sequencer/src/ingress/inclusion_lane/types.rs similarity index 58% rename from sequencer/src/inclusion_lane/types.rs rename to sequencer/src/ingress/inclusion_lane/types.rs index 535dc89..b113db0 100644 --- a/sequencer/src/inclusion_lane/types.rs +++ b/sequencer/src/ingress/inclusion_lane/types.rs @@ -1,12 +1,17 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Cross-module types: the unit of work the API hands the lane, and the +//! per-op outcome the lane sends back through the response channel. + use std::time::SystemTime; use sequencer_core::user_op::SignedUserOp; use thiserror::Error; use tokio::sync::oneshot; +/// A signed user op accepted by the API and queued for the inclusion lane. +/// The lane sends the inclusion outcome back through `respond_to`. #[derive(Debug)] pub struct PendingUserOp { pub signed: SignedUserOp, @@ -14,6 +19,11 @@ pub struct PendingUserOp { pub received_at: SystemTime, } +/// Per-op outcome reported back to the API caller via the response channel. +/// +/// - `Invalid` — application rejected the op (nonce mismatch, fee too low, etc.); maps to HTTP 4xx. +/// - `Unavailable` — sequencer can't currently accept (shutting down, queue full); maps to HTTP 503/429. +/// - `Internal` — bug or unrecoverable failure; maps to HTTP 500. #[derive(Debug, Error, Clone)] pub enum SequencerError { #[error("{0}")] diff --git a/sequencer/src/ingress/mod.rs b/sequencer/src/ingress/mod.rs new file mode 100644 index 0000000..3795ac2 --- /dev/null +++ b/sequencer/src/ingress/mod.rs @@ -0,0 +1,9 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Inbound side: HTTP submit endpoint and the inclusion lane that consumes its +//! queue. The submit API is the public-facing port; the lane is the only writer +//! of open batch/frame state in storage. + +pub mod api; +pub mod inclusion_lane; diff --git a/sequencer/src/input_reader/mod.rs b/sequencer/src/input_reader/mod.rs deleted file mode 100644 index 46fc0d9..0000000 --- a/sequencer/src/input_reader/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Reads safe InputBox inputs from a reference source (e.g. InputBox contract) and appends them -//! to sequencer storage. Minimal design: no epochs or consensus; flat contiguous indices only. - -mod reader; - -pub use reader::{InputReader, InputReaderConfig, InputReaderError}; diff --git a/sequencer/src/l1/mod.rs b/sequencer/src/l1/mod.rs new file mode 100644 index 0000000..9300410 --- /dev/null +++ b/sequencer/src/l1/mod.rs @@ -0,0 +1,11 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! L1 client surface: reads InputBox events into storage (`reader`), submits +//! batches back out (`submitter`), and shares L1 utilities (`provider`, +//! `partition`). + +pub mod partition; +pub mod provider; +pub mod reader; +pub mod submitter; diff --git a/sequencer/src/partition.rs b/sequencer/src/l1/partition.rs similarity index 100% rename from sequencer/src/partition.rs rename to sequencer/src/l1/partition.rs diff --git a/sequencer/src/l1/provider.rs b/sequencer/src/l1/provider.rs new file mode 100644 index 0000000..672c8b9 --- /dev/null +++ b/sequencer/src/l1/provider.rs @@ -0,0 +1,156 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use std::str::FromStr; +use std::time::Duration; + +use alloy::{ + providers::{DynProvider, Provider, ProviderBuilder}, + rpc::client::RpcClient, + signers::local::PrivateKeySigner, + transports::http::{Http, reqwest, reqwest::Url}, +}; +use alloy_transport::layers::RetryBackoffLayer; + +// Public Ethereum providers (Infura, Alchemy) commonly take 30–60s on heavy +// `eth_getLogs` queries under load. The partition-retry helper in +// `l1/partition.rs` only kicks on RPC error codes (e.g. -32005), not on +// transport timeouts — a request that silently chews past the timeout slips +// past partitioning. 60s is long enough that hitting it signals a genuine +// problem rather than a slow query. +const REQUEST_TIMEOUT: Duration = Duration::from_secs(60); +const MAX_RATE_LIMIT_RETRIES: u32 = 5; +const INITIAL_BACKOFF_MS: u64 = 200; +const COMPUTE_UNITS_PER_SEC: u64 = 500; + +fn create_client(url: &str) -> Result { + let url = Url::parse(url).map_err(|e| format!("invalid RPC URL: {e}"))?; + + // Reject non-HTTPS for remote hosts to prevent accidental plaintext RPC. + // `url::Url::host_str` returns bracket-wrapped IPv6 literals (e.g. "[::1]"). + if url.scheme() != "https" && !is_loopback_host(url.host_str().unwrap_or("")) { + return Err(format!( + "remote RPC must use https, got {}://", + url.scheme() + )); + } + + let http_client = reqwest::Client::builder() + .timeout(REQUEST_TIMEOUT) + .build() + .map_err(|e| format!("failed to build HTTP client: {e}"))?; + + let transport = Http::with_client(http_client, url); + let is_local = transport.guess_local(); + + let retry = RetryBackoffLayer::new( + MAX_RATE_LIMIT_RETRIES, + INITIAL_BACKOFF_MS, + COMPUTE_UNITS_PER_SEC, + ); + + Ok(RpcClient::builder() + .layer(retry) + .transport(transport, is_local)) +} + +/// Check whether a URL host string refers to a loopback address. +/// +/// `url::Url::host_str` wraps IPv6 literals in brackets (e.g. `[::1]`), which +/// this helper normalizes alongside the IPv4 and DNS forms. +fn is_loopback_host(host: &str) -> bool { + matches!(host, "localhost" | "127.0.0.1" | "::1" | "[::1]") +} + +/// Create a read-only provider with retry and timeout. +pub fn create_provider(url: &str) -> Result { + let client = create_client(url)?; + let provider = ProviderBuilder::new().connect_client(client); + Ok(provider.erased()) +} + +/// Create a provider with a wallet signer, retry, and timeout. +pub fn create_signer_provider(url: &str, private_key: &str) -> Result { + let client = create_client(url)?; + let signer = + PrivateKeySigner::from_str(private_key).map_err(|_| "invalid private key".to_string())?; + let provider = ProviderBuilder::new().wallet(signer).connect_client(client); + Ok(provider.erased()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── H4 regression: URL scheme enforcement ───────────── + + #[test] + fn create_client_rejects_http_for_remote_host() { + let err = create_client("http://mainnet.infura.io/v3/abc123") + .expect_err("http:// for remote host must be rejected"); + assert!( + err.contains("https"), + "error should explain https requirement, got: {err}" + ); + } + + #[test] + fn create_client_accepts_http_for_127_0_0_1() { + create_client("http://127.0.0.1:8545").expect("loopback http:// must be accepted"); + } + + #[test] + fn create_client_accepts_http_for_localhost() { + create_client("http://localhost:8545").expect("localhost http:// must be accepted"); + } + + #[test] + fn create_client_accepts_http_for_ipv6_loopback() { + create_client("http://[::1]:8545").expect("IPv6 loopback http:// must be accepted"); + } + + #[test] + fn create_client_accepts_https_for_remote_host() { + create_client("https://mainnet.infura.io/v3/abc123").expect("https:// must be accepted"); + } + + // ── H3 regression: private-key parse error must not echo bytes ─ + + #[test] + fn create_signer_provider_does_not_echo_key_bytes_on_invalid_hex() { + // A malformed key that would otherwise cause alloy's error Display to + // embed a character from the input. The fix replaced {e} with a fixed + // string. Assert the error is the fixed string exactly — not a prefix + // match — so a future change that re-adds interpolation is caught. + let bad_key = + "0xZZZZ_zzzz_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff"; + let err = create_signer_provider("http://127.0.0.1:8545", bad_key) + .expect_err("malformed hex key must be rejected"); + assert_eq!( + err, "invalid private key", + "error message must be the fixed constant — no key bytes, no hex excerpt" + ); + // Belt-and-suspenders: no characters from the bad key should appear. + assert!( + !err.contains('Z') && !err.contains('z') && !err.contains('f'), + "error must not reflect any byte of the input key: {err}" + ); + } + + #[test] + fn create_signer_provider_does_not_echo_key_bytes_on_odd_length() { + // Odd-length hex would trigger a different error variant. Same + // invariant: fixed error message, no key bytes leaked. + let bad_key = "0xabc"; + let err = create_signer_provider("http://127.0.0.1:8545", bad_key) + .expect_err("odd-length hex key must be rejected"); + assert_eq!(err, "invalid private key"); + } + + #[test] + fn create_signer_provider_accepts_valid_key() { + let good_key = "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80"; + create_signer_provider("http://127.0.0.1:8545", good_key) + .expect("valid key must be accepted"); + } +} diff --git a/sequencer/src/input_reader/reader.rs b/sequencer/src/l1/reader.rs similarity index 61% rename from sequencer/src/input_reader/reader.rs rename to sequencer/src/l1/reader.rs index b157f81..64afb25 100644 --- a/sequencer/src/input_reader/reader.rs +++ b/sequencer/src/l1/reader.rs @@ -1,6 +1,9 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Reads safe InputBox events from L1 and appends them to sequencer storage. +//! Minimal design: no epochs or consensus; flat contiguous indices only. + use std::time::Duration; use alloy::eips::BlockNumberOrTag::Safe; @@ -13,13 +16,12 @@ use cartesi_rollups_contracts::data_availability::DataAvailability::{ }; use cartesi_rollups_contracts::input_box::InputBox; use tokio::task::JoinHandle; -use tracing::{info, warn}; +use tracing::info; -use crate::partition::{decode_evm_advance_input, get_input_added_events}; -use crate::shutdown::ShutdownSignal; +use crate::l1::partition::{decode_evm_advance_input, get_input_added_events}; +use crate::runtime::shutdown::ShutdownSignal; use crate::storage::{Storage, StorageOpenError, StoredSafeInput}; - -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; +use sequencer_core::protocol::ProtocolTiming; #[derive(Debug, Clone)] pub struct InputReaderConfig { @@ -34,6 +36,8 @@ pub struct InputReaderConfig { pub enum InputReaderError { #[error("provider/transport: {0}")] Provider(String), + #[error("bootstrap: {0}")] + Bootstrap(String), #[error(transparent)] OpenStorage(#[from] StorageOpenError), #[error(transparent)] @@ -47,23 +51,30 @@ pub struct InputReader { input_box_address: Address, genesis_block: u64, db_path: String, - shutdown: ShutdownSignal, + /// Scheduler-acceptance identity — passed into + /// [`Storage::append_safe_inputs`] so the persisted scheduler-accepted + /// frontier filters by the right sender. + batch_submitter: Address, + /// Protocol timing used to keep `safe_accepted_batches` consistent with + /// every `append_safe_inputs` write. + timing: ProtocolTiming, } impl InputReader { pub async fn new( db_path: impl Into, - shutdown: ShutdownSignal, config: InputReaderConfig, + batch_submitter: Address, + timing: ProtocolTiming, ) -> Result { - let provider = crate::provider::create_provider(&config.rpc_url) - .map_err(InputReaderError::Provider)?; + let provider = crate::l1::provider::create_provider(&config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; let application = Application::new(config.app_address, &provider); let data_availability = application .getDataAvailability() .call() .await - .map_err(|e| InputReaderError::Provider(e.to_string()))?; + .map_err(map_contract_bootstrap_error)?; let input_box_address = decode_input_box_address(&data_availability)?; let input_box = InputBox::new(input_box_address, &provider); @@ -71,10 +82,10 @@ impl InputReader { .getDeploymentBlockNumber() .call() .await - .map_err(|e| InputReaderError::Provider(e.to_string()))? + .map_err(map_contract_bootstrap_error)? .try_into() .map_err(|_| { - InputReaderError::Provider( + InputReaderError::Bootstrap( "input box deployment block number did not fit into u64".to_string(), ) })?; @@ -84,23 +95,26 @@ impl InputReader { input_box_address, genesis_block, db_path.into(), - shutdown, + batch_submitter, + timing, )) } - fn from_parts( + pub fn from_parts( config: InputReaderConfig, input_box_address: Address, genesis_block: u64, db_path: String, - shutdown: ShutdownSignal, + batch_submitter: Address, + timing: ProtocolTiming, ) -> Self { Self { config, input_box_address, genesis_block, db_path, - shutdown, + batch_submitter, + timing, } } @@ -112,42 +126,55 @@ impl InputReader { self.genesis_block } - pub fn start(self) -> Result>, StorageOpenError> { - let _ = Storage::open(self.db_path.as_str(), SQLITE_SYNCHRONOUS_PRAGMA)?; - Ok(tokio::spawn(async move { self.run_forever().await })) + /// Spawn the worker loop. The `shutdown` signal is what the loop respects; + /// passing it at start time (instead of construction time) keeps the + /// construction phase pure and ensures the same instance can't accidentally + /// be started under two different shutdown signals. + pub fn start( + self, + shutdown: ShutdownSignal, + ) -> Result>, StorageOpenError> { + let _ = Storage::open(self.db_path.as_str())?; + Ok(tokio::spawn( + async move { self.run_forever(shutdown).await }, + )) } pub async fn sync_to_current_safe_head(&mut self) -> Result<(), InputReaderError> { - self.bootstrap_safe_head().await?; - - let provider = crate::provider::create_provider(&self.config.rpc_url) - .map_err(InputReaderError::Provider)?; + let provider = crate::l1::provider::create_provider(&self.config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; self.advance_once(&provider).await } - async fn run_forever(mut self) -> Result<(), InputReaderError> { - self.bootstrap_safe_head().await?; - - let provider = crate::provider::create_provider(&self.config.rpc_url) - .map_err(InputReaderError::Provider)?; + /// Top-level driver. Races the work loop against the shutdown signal. + /// + /// `biased;` polls the shutdown arm first on every wakeup so a concurrent + /// shutdown wins over an in-flight `run_loop` step. Without `biased`, + /// `select!` would pick randomly between two ready branches and could + /// process one more iteration before shutting down. + async fn run_forever(self, shutdown: ShutdownSignal) -> Result<(), InputReaderError> { + tokio::select! { + biased; + _ = shutdown.wait_for_shutdown() => Ok(()), + result = self.run_loop() => result, + } + } + /// Tick → sleep → tick. Provider errors are logged and retried; other + /// errors propagate. Shutdown is handled by the outer `run_forever` + /// select, so this loop has no shutdown concerns. + async fn run_loop(mut self) -> Result<(), InputReaderError> { + let provider = crate::l1::provider::create_provider(&self.config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; loop { - if self.shutdown.is_shutdown_requested() { - return Ok(()); - } - match self.advance_once(&provider).await { Ok(()) => {} Err(InputReaderError::Provider(error)) => { - warn!(error, "input reader advance failed, will retry"); + tracing::error!(error, "L1 provider error in input reader — will retry"); } Err(err) => return Err(err), } - - tokio::select! { - _ = self.shutdown.wait_for_shutdown() => return Ok(()), - _ = tokio::time::sleep(self.config.poll_interval) => {} - } + tokio::time::sleep(self.config.poll_interval).await; } } @@ -155,16 +182,25 @@ impl InputReader { &mut self, provider: &impl Provider, ) -> Result<(), InputReaderError> { - let current_safe_block = latest_safe_block(provider).await?; + let current_safe_head = latest_safe_head(provider).await?; + let current_safe_block = current_safe_head.block_number; let previous_safe_block = self.current_safe_block().await?; + let scan_floor = + previous_safe_block.unwrap_or_else(|| self.genesis_block.saturating_sub(1)); // If our persisted safe head is already at the current safe frontier, - // there is nothing new to scan. - if current_safe_block <= previous_safe_block { + // there is nothing new to scan. On the first observation we still + // persist the real safe head so storage distinguishes "observed L1" + // from "no L1 view yet". + if current_safe_block <= scan_floor { + if previous_safe_block.is_none() { + self.append_safe_inputs(current_safe_head, Vec::new()) + .await?; + } return Ok(()); } - let start_block = previous_safe_block + 1; + let start_block = scan_floor + 1; let events = get_input_added_events( provider, self.config.app_address, @@ -211,42 +247,37 @@ impl InputReader { "appending safe inputs" ); - self.append_safe_inputs(current_safe_block, batch).await + self.append_safe_inputs(current_safe_head, batch).await } - async fn current_safe_block(&self) -> Result { + async fn current_safe_block(&self) -> Result, InputReaderError> { let db_path = self.db_path.clone(); tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; + let mut storage = Storage::open(&db_path)?; storage.current_safe_block().map_err(InputReaderError::from) }) .await .map_err(|err| InputReaderError::Join(err.to_string()))? } - async fn bootstrap_safe_head(&self) -> Result<(), InputReaderError> { - let db_path = self.db_path.clone(); - let minimum_safe_block = self.genesis_block.saturating_sub(1); - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; - storage - .ensure_minimum_safe_block(minimum_safe_block) - .map_err(InputReaderError::from) - }) - .await - .map_err(|err| InputReaderError::Join(err.to_string()))? - } - async fn append_safe_inputs( &self, - current_safe_block: u64, + current_safe_head: SafeHead, batch: Vec, ) -> Result<(), InputReaderError> { let db_path = self.db_path.clone(); + let batch_submitter = self.batch_submitter; + let timing = self.timing; tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; + let mut storage = Storage::open(&db_path)?; storage - .append_safe_inputs(current_safe_block, &batch) + .append_safe_inputs_with_timestamp( + current_safe_head.block_number, + current_safe_head.block_timestamp, + &batch, + batch_submitter, + &timing, + ) .map_err(InputReaderError::from) }) .await @@ -256,7 +287,7 @@ impl InputReader { fn decode_input_box_address(data_availability: &[u8]) -> Result { let call = DataAvailabilityCalls::abi_decode(data_availability).map_err(|err| { - InputReaderError::Provider(format!( + InputReaderError::Bootstrap(format!( "application getDataAvailability returned invalid DataAvailability calldata: {err}" )) })?; @@ -267,19 +298,35 @@ fn decode_input_box_address(data_availability: &[u8]) -> Result Err(InputReaderError::Provider(format!( + }) => Err(InputReaderError::Bootstrap(format!( "application getDataAvailability returned unsupported DataAvailability.InputBoxAndEspresso(inputBox={inputBox}, fromBlock={fromBlock}, namespaceId={namespaceId})" ))), } } -async fn latest_safe_block(provider: &impl Provider) -> Result { +fn map_contract_bootstrap_error(err: alloy::contract::Error) -> InputReaderError { + match err { + alloy::contract::Error::TransportError(_) => InputReaderError::Provider(err.to_string()), + _ => InputReaderError::Bootstrap(err.to_string()), + } +} + +#[derive(Debug, Clone, Copy)] +struct SafeHead { + block_number: u64, + block_timestamp: u64, +} + +async fn latest_safe_head(provider: &impl Provider) -> Result { let block = provider .get_block(Safe.into()) .await .map_err(|e| InputReaderError::Provider(e.to_string()))? .ok_or_else(|| InputReaderError::Provider("get_block returned None".to_string()))?; - Ok(block.header.number) + Ok(SafeHead { + block_number: block.header.number, + block_timestamp: block.header.timestamp, + }) } #[cfg(test)] @@ -289,12 +336,20 @@ mod tests { use alloy::sol_types::SolCall; use tempfile::NamedTempFile; + fn test_timing() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } + } + fn test_reader( db_path: String, rpc_url: String, genesis_block: u64, poll_interval: Duration, - shutdown: ShutdownSignal, ) -> InputReader { InputReader::from_parts( InputReaderConfig { @@ -306,12 +361,22 @@ mod tests { Address::ZERO, genesis_block, db_path, - shutdown, + Address::ZERO, + test_timing(), ) } - fn require_anvil_tests() -> bool { - std::env::var_os("RUN_ANVIL_TESTS").is_some() + /// Verify that `anvil` is available. Panics with a clear message if not found. + fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); } #[tokio::test] @@ -323,9 +388,8 @@ mod tests { "http://127.0.0.1:0".to_string(), 0, Duration::from_millis(20), - shutdown.clone(), ); - let handle = reader.start().expect("start input reader"); + let handle = reader.start(shutdown.clone()).expect("start input reader"); shutdown.request_shutdown(); let join_result = tokio::time::timeout(Duration::from_secs(2), handle).await; @@ -339,9 +403,7 @@ mod tests { #[tokio::test] async fn start_with_anvil_request_shutdown_then_join_returns_ok() { - if !require_anvil_tests() { - return; - } + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let shutdown = ShutdownSignal::default(); @@ -351,9 +413,8 @@ mod tests { anvil.endpoint_url().to_string(), 0, Duration::from_millis(50), - shutdown.clone(), ); - let handle = reader.start().expect("start input reader"); + let handle = reader.start(shutdown.clone()).expect("start input reader"); tokio::time::sleep(Duration::from_millis(200)).await; shutdown.request_shutdown(); @@ -369,9 +430,7 @@ mod tests { #[tokio::test] async fn advance_once_with_anvil_updates_safe_head_when_block_available() { - if !require_anvil_tests() { - return; - } + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let db_file = NamedTempFile::new().expect("temp file"); @@ -380,7 +439,6 @@ mod tests { anvil.endpoint_url().to_string(), 0, Duration::from_secs(1), - ShutdownSignal::default(), ); let provider = alloy::providers::ProviderBuilder::new() .connect(anvil.endpoint_url().to_string().as_str()) @@ -390,19 +448,19 @@ mod tests { reader.advance_once(&provider).await.expect("advance_once"); let safe_block = reader.current_safe_block().await.expect("read safe block"); let safe_end = { - let mut storage = Storage::open( - db_file.path().to_string_lossy().as_ref(), - SQLITE_SYNCHRONOUS_PRAGMA, - ) - .expect("open storage"); + let mut storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("open storage"); storage.safe_input_end_exclusive().expect("safe end") }; assert_eq!(safe_end, 0, "no InputAdded contract so no direct inputs"); - let _ = safe_block; + assert!( + safe_block.is_some(), + "first successful safe-head observation should create the row" + ); } #[tokio::test] - async fn advance_once_with_genesis_block_uses_genesis_as_effective_prev() { + async fn current_safe_block_is_unknown_before_first_observation() { let db_file = NamedTempFile::new().expect("temp file"); let genesis_block = 2_u64; let reader = test_reader( @@ -410,20 +468,14 @@ mod tests { "http://127.0.0.1:0".to_string(), genesis_block, Duration::from_secs(1), - ShutdownSignal::default(), ); - reader - .bootstrap_safe_head() - .await - .expect("bootstrap safe head"); - let safe_block = reader.current_safe_block().await.expect("read safe block"); - assert_eq!(safe_block, genesis_block - 1); + assert_eq!(safe_block, None); } #[tokio::test] - async fn sync_to_current_safe_head_with_genesis_block_bootstraps_safe_head() { + async fn sync_to_current_safe_head_failure_leaves_safe_head_unknown() { let db_file = NamedTempFile::new().expect("temp file"); let genesis_block = 5_u64; let mut reader = test_reader( @@ -431,44 +483,68 @@ mod tests { "http://127.0.0.1:0".to_string(), genesis_block, Duration::from_secs(1), - ShutdownSignal::default(), ); let result = reader.sync_to_current_safe_head().await; assert!(matches!(result, Err(InputReaderError::Provider(_)))); - let mut storage = Storage::open( - db_file.path().to_string_lossy().as_ref(), - SQLITE_SYNCHRONOUS_PRAGMA, - ) - .expect("open storage"); + let mut storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("open storage"); assert_eq!( storage.current_safe_block().expect("read safe block"), - genesis_block - 1 + None, + "failed sync must not create a synthetic safe-head row" ); } #[tokio::test] - async fn advance_once_when_safe_head_ahead_of_chain_is_no_op() { - if !require_anvil_tests() { - return; + async fn new_with_invalid_rpc_url_returns_bootstrap_error() { + let db_file = NamedTempFile::new().expect("temp file"); + + let result = InputReader::new( + db_file.path().to_string_lossy().into_owned(), + InputReaderConfig { + rpc_url: "not-a-valid-url".to_string(), + app_address: Address::ZERO, + poll_interval: Duration::from_secs(1), + long_block_range_error_codes: Vec::new(), + }, + Address::ZERO, + test_timing(), + ) + .await; + + match result { + Err(InputReaderError::Bootstrap(_)) => {} + Err(other) => panic!("expected bootstrap error, got {other:?}"), + Ok(_) => panic!("invalid RPC URL should fail during bootstrap"), } + } + + #[tokio::test] + async fn advance_once_when_safe_head_ahead_of_chain_is_no_op() { + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let db_file = NamedTempFile::new().expect("temp file"); let db_path = db_file.path().to_string_lossy().into_owned(); - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); + let mut storage = Storage::open(&db_path).expect("open storage"); + let timing = test_timing(); storage - .append_safe_inputs(1000, &[]) + .append_safe_inputs(1000, &[], Address::ZERO, &timing) .expect("set safe head ahead of chain"); + let recorded_sync = storage + .last_safe_progress_ms() + .expect("read safe-progress timestamp") + .expect("append_safe_inputs should stamp safe progress"); + drop(storage); let mut reader = test_reader( db_path, anvil.endpoint_url().to_string(), 0, Duration::from_secs(1), - ShutdownSignal::default(), ); let provider = alloy::providers::ProviderBuilder::new() .connect(anvil.endpoint_url().to_string().as_str()) @@ -478,9 +554,19 @@ mod tests { reader.advance_once(&provider).await.expect("advance_once"); assert_eq!( reader.current_safe_block().await.expect("read"), - 1000, + Some(1000), "safe head should remain unchanged when already ahead of chain" ); + + let storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("re-open storage"); + assert_eq!( + storage + .last_safe_progress_ms() + .expect("read unchanged safe-progress timestamp"), + Some(recorded_sync), + "same-head polls must not refresh the safe-progress marker" + ); } #[test] diff --git a/sequencer/src/batch_submitter/config.rs b/sequencer/src/l1/submitter/config.rs similarity index 53% rename from sequencer/src/batch_submitter/config.rs rename to sequencer/src/l1/submitter/config.rs index 6b0fd48..beddd9d 100644 --- a/sequencer/src/batch_submitter/config.rs +++ b/sequencer/src/l1/submitter/config.rs @@ -3,9 +3,14 @@ use std::time::Duration; -/// Batch-submitter-specific options. L1 RPC URL and InputBox address are shared with the -/// input reader and come from the same discovery at startup (see `L1Config` in `config`). -/// These fields are parsed as part of `RunConfig` and passed through at runtime. +/// Batch-submitter-specific options. L1 RPC URL and InputBox address are shared +/// with the input reader and come from the same discovery at startup (see +/// `L1Config` in `config`). These fields are parsed as part of `RunConfig` and +/// passed through at runtime. +/// +/// Danger-zone tuning (`max_wait_blocks`, `preemptive_margin_blocks`, +/// `seconds_per_block`) lives in `ProtocolTiming`, not here — the submitter +/// doesn't read it. The [`crate::recovery::DangerDetector`] worker owns that. #[derive(Debug, Clone)] pub struct BatchSubmitterConfig { /// How often the submitter polls for new work when idle. diff --git a/sequencer/src/l1/submitter/mod.rs b/sequencer/src/l1/submitter/mod.rs new file mode 100644 index 0000000..7f53823 --- /dev/null +++ b/sequencer/src/l1/submitter/mod.rs @@ -0,0 +1,17 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch submitter: posts closed batches to L1 with at-least-once semantics. +//! +//! Each valid closed batch has a structural nonce (`batches.nonce`, set at +//! creation time as `parent.nonce + 1`). The scheduler checks that nonces are +//! strictly increasing and skips otherwise, so duplicates are deduplicated at +//! the scheduler level. See `worker` for the tick loop. + +mod config; +mod poster; +mod worker; + +pub use config::BatchSubmitterConfig; +pub use poster::{BatchPoster, BatchPosterConfig, BatchPosterError, EthereumBatchPoster, TxHash}; +pub use worker::{BatchSubmitter, BatchSubmitterError, SubmitterExit}; diff --git a/sequencer/src/l1/submitter/poster.rs b/sequencer/src/l1/submitter/poster.rs new file mode 100644 index 0000000..207d82a --- /dev/null +++ b/sequencer/src/l1/submitter/poster.rs @@ -0,0 +1,347 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use alloy::providers::{ + DynProvider, PendingTransactionBuilder, PendingTransactionConfig, PendingTransactionError, + Provider, +}; +use alloy::rpc::types::BlockNumberOrTag; +use async_trait::async_trait; +use cartesi_rollups_contracts::input_box::InputBox; +use sequencer_core::batch::Batch; +use thiserror::Error; +use tracing::{debug, info, warn}; + +use crate::l1::partition::{decode_evm_advance_input, get_input_added_events}; + +pub type TxHash = alloy_primitives::B256; + +#[derive(Debug, Clone)] +pub struct BatchPosterConfig { + pub l1_submit_address: alloy_primitives::Address, + pub app_address: alloy_primitives::Address, + pub batch_submitter_address: alloy_primitives::Address, + pub start_block: u64, + pub confirmation_depth: u64, + /// Assumed L1 block time in seconds, used to derive a conservative + /// confirmation timeout for watched batch-submission txs. + pub seconds_per_block: u64, + /// Error codes that trigger `get_logs` retries with a shorter block range. + pub long_block_range_error_codes: Vec, +} + +#[derive(Debug, Error)] +pub enum BatchPosterError { + #[error("provider/transport: {0}")] + Provider(String), +} + +#[async_trait] +pub trait BatchPoster: Send + Sync { + async fn submit_batches(&self, payloads: Vec>) + -> Result, BatchPosterError>; + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> Result, BatchPosterError>; +} + +#[derive(Clone)] +pub struct EthereumBatchPoster { + provider: DynProvider, + config: BatchPosterConfig, +} + +impl EthereumBatchPoster { + pub fn new(provider: DynProvider, config: BatchPosterConfig) -> Self { + Self { provider, config } + } + + /// Conservative upper-bound timeout for waiting on confirmations, derived + /// from the configured block time. Shorter block times on other chains just + /// make the watch complete sooner. + fn confirmation_timeout(&self) -> std::time::Duration { + derive_confirmation_timeout( + self.config.confirmation_depth, + self.config.seconds_per_block, + ) + } + + async fn latest_account_nonce(&self) -> Result { + self.provider + .get_transaction_count(self.config.batch_submitter_address) + .block_id(BlockNumberOrTag::Latest.into()) + .await + .map_err(|err| BatchPosterError::Provider(err.to_string())) + } + + async fn send_batch_at_nonce( + &self, + payload: Vec, + nonce: u64, + fees: &alloy::providers::utils::Eip1559Estimation, + ) -> Result, BatchPosterError> { + let input_box = InputBox::new(self.config.l1_submit_address, &self.provider); + input_box + .addInput(self.config.app_address, payload.into()) + .nonce(nonce) + .max_fee_per_gas(fees.max_fee_per_gas) + .max_priority_fee_per_gas(fees.max_priority_fee_per_gas) + .send() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string())) + } + + /// Wait serially for each tx to reach `confirmation_depth + 1` confirmations. + /// + /// **Serial is not a performance concession; it's correct.** Ethereum mines + /// transactions from a single EOA in strict wallet-nonce order: tx[k] cannot + /// land on-chain until tx[k-1] has landed. So: + /// + /// - If tx[0] times out, tx[1..] cannot have been mined either; watching + /// them is provably pointless. We return `Ok(())` early and let the next + /// tick retry the whole sequence. + /// - If tx[0] confirms, tx[1] was blocked only on tx[0] and is unblocked by + /// the time we start watching it. + /// + /// Timeouts return `Ok(())` rather than `Err` because the safe response is + /// "re-enter `submit_batches` on the next tick" — which re-estimates fees + /// (natural replacement bump) and re-submits at the same wallet nonces. The + /// wallet-nonce ordering invariant above guarantees we cannot accidentally + /// skip work by returning early here. + async fn wait_for_confirmations(&self, tx_hashes: &[TxHash]) -> Result<(), BatchPosterError> { + let timeout = self.confirmation_timeout(); + for tx_hash in tx_hashes { + let watch = PendingTransactionConfig::new(*tx_hash) + .with_required_confirmations(self.config.confirmation_depth.saturating_add(1)) + .with_timeout(Some(timeout)) + .with_provider(self.provider.root().clone()); + match watch.watch().await { + Ok(_) => { + info!( + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + required_confirmations = self.config.confirmation_depth.saturating_add(1), + "batch submission confirmed on L1" + ); + } + Err(PendingTransactionError::TxWatcher( + alloy::providers::WatchTxError::Timeout, + )) => { + warn!( + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + timeout_secs = timeout.as_secs(), + "timed out waiting for batch submission confirmations; next tick will retry under fresher state" + ); + return Ok(()); + } + Err(err) => return Err(BatchPosterError::Provider(err.to_string())), + } + } + + Ok(()) + } +} + +fn derive_confirmation_timeout( + confirmation_depth: u64, + seconds_per_block: u64, +) -> std::time::Duration { + let blocks_to_wait = confirmation_depth.saturating_add(1).saturating_mul(2); + std::time::Duration::from_secs(blocks_to_wait.saturating_mul(seconds_per_block)) +} + +#[async_trait] +impl BatchPoster for EthereumBatchPoster { + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + if payloads.is_empty() { + return Ok(Vec::new()); + } + + let fees = self + .provider + .estimate_eip1559_fees() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string()))?; + let mut next_nonce = self.latest_account_nonce().await?; + let mut tx_hashes = Vec::with_capacity(payloads.len()); + + for payload in payloads { + let pending = self.send_batch_at_nonce(payload, next_nonce, &fees).await?; + let tx_hash = *pending.tx_hash(); + debug!( + tx_nonce = next_nonce, + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + "sent batch submission tx to L1" + ); + tx_hashes.push(tx_hash); + next_nonce = next_nonce.saturating_add(1); + } + + self.wait_for_confirmations(tx_hashes.as_slice()).await?; + Ok(tx_hashes) + } + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> Result, BatchPosterError> { + let latest = self + .provider + .get_block_number() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string()))?; + let start_block = from_block.max(self.config.start_block); + if start_block > latest { + return Ok(Vec::new()); + } + + let events = get_input_added_events( + &self.provider, + self.config.app_address, + &self.config.l1_submit_address, + start_block, + latest, + self.config.long_block_range_error_codes.as_slice(), + ) + .await + .map_err(|errs| { + BatchPosterError::Provider( + errs.into_iter() + .next() + .map(|e| e.to_string()) + .unwrap_or_default(), + ) + })?; + + let mut observed_nonces = Vec::new(); + for (event, _log) in events { + let evm_advance = decode_evm_advance_input(event.input.as_ref()) + .map_err(BatchPosterError::Provider)?; + if evm_advance.msgSender != self.config.batch_submitter_address { + continue; + } + let batch: Batch = ssz::Decode::from_ssz_bytes(evm_advance.payload.as_ref()) + .map_err(|err| BatchPosterError::Provider(format!("{err:?}")))?; + observed_nonces.push(batch.nonce); + } + + Ok(observed_nonces) + } +} + +#[cfg(test)] +pub(crate) mod mock { + use super::{Batch, BatchPoster, BatchPosterError, TxHash}; + use async_trait::async_trait; + use std::sync::Mutex; + + #[derive(Debug)] + pub struct MockBatchPoster { + pub submissions: Mutex>, + pub observed_submitted_nonces: Mutex>, + pub observed_submitted_error: Mutex>, + pub last_from_block: Mutex>, + } + + impl MockBatchPoster { + pub fn new() -> Self { + Self { + submissions: Mutex::new(Vec::new()), + observed_submitted_nonces: Mutex::new(Vec::new()), + observed_submitted_error: Mutex::new(None), + last_from_block: Mutex::new(None), + } + } + + pub fn submissions(&self) -> Vec<(u64, usize)> { + self.submissions.lock().expect("lock").clone() + } + + pub fn set_observed_submitted_nonces(&self, value: Vec) { + *self.observed_submitted_nonces.lock().expect("lock") = value; + } + + pub fn set_observed_submitted_error(&self, value: Option<&str>) { + *self.observed_submitted_error.lock().expect("lock") = value.map(str::to_string); + } + + pub fn last_from_block(&self) -> Option { + *self.last_from_block.lock().expect("lock") + } + } + + #[async_trait] + impl BatchPoster for MockBatchPoster { + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + let mut tx_hashes = Vec::with_capacity(payloads.len()); + for payload in payloads { + let batch_index = ssz::Decode::from_ssz_bytes(payload.as_ref()) + .map(|b: Batch| b.nonce) + .unwrap_or(0); + self.submissions + .lock() + .expect("lock") + .push((batch_index, payload.len())); + tx_hashes.push(TxHash::ZERO); + } + Ok(tx_hashes) + } + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> Result, BatchPosterError> { + *self.last_from_block.lock().expect("lock") = Some(from_block); + if let Some(err) = self.observed_submitted_error.lock().expect("lock").clone() { + return Err(BatchPosterError::Provider(err)); + } + let configured = self.observed_submitted_nonces.lock().expect("lock").clone(); + if !configured.is_empty() { + return Ok(configured); + } + Ok(self + .submissions + .lock() + .expect("lock") + .iter() + .map(|(idx, _)| *idx) + .collect()) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::{BatchPoster, derive_confirmation_timeout, mock::MockBatchPoster}; + + #[tokio::test] + async fn mock_poster_tracks_requested_suffix_start_block() { + let poster = MockBatchPoster::new(); + let observed = poster + .observed_submitted_batch_nonces(42) + .await + .expect("observe submitted batches"); + + assert!(observed.is_empty()); + assert_eq!(poster.last_from_block(), Some(42)); + } + + #[test] + fn confirmation_timeout_derives_from_seconds_per_block() { + assert_eq!(derive_confirmation_timeout(2, 12), Duration::from_secs(72)); + assert_eq!(derive_confirmation_timeout(2, 1), Duration::from_secs(6)); + assert_eq!(derive_confirmation_timeout(5, 3), Duration::from_secs(36)); + } +} diff --git a/sequencer/src/l1/submitter/worker.rs b/sequencer/src/l1/submitter/worker.rs new file mode 100644 index 0000000..d47d885 --- /dev/null +++ b/sequencer/src/l1/submitter/worker.rs @@ -0,0 +1,513 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch submitter worker: stateless, at-least-once submission to L1. +//! +//! The submitter never observes danger — that is the [`crate::recovery::DangerDetector`] +//! worker's job. Each tick here is a pure "what pending work is left?" step: +//! +//! 1. Read the scheduler-accepted frontier (safe block + next-expected nonce) +//! from SQLite. Shared snapshot maintained by the input reader via +//! `append_safe_inputs`. +//! 2. Query L1 for batch submissions newer than the safe block; fold any +//! matching observed nonces to advance the local expected nonce past +//! already-mined submissions. +//! 3. Load every valid closed batch whose nonce is still past the advanced +//! frontier and submit them all in one shot. +//! +//! The outer loop is uniform: tick, maybe sleep, repeat. A tick that produced +//! submissions re-enters immediately (no sleep) so the suffix drains quickly; +//! an idle or transient-error tick sleeps `idle_poll_interval` before the next +//! attempt. +//! +//! Mid-tick cancellation is crash-safe: storage transactions either commit or +//! auto-roll-back on drop, and any already-sent L1 transaction is picked up by +//! the next startup's `observed_submitted_batch_nonces` scan. + +use std::sync::Arc; +use std::time::Duration; + +use thiserror::Error; +use tracing::{debug, error}; + +use crate::l1::submitter::{BatchPoster, BatchPosterError, BatchSubmitterConfig}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{PendingBatch, Storage, StorageOpenError, SubmitterFrontier}; + +#[derive(Debug, Error)] +pub enum BatchSubmitterError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("batch submitter join error: {0}")] + Join(String), + #[error(transparent)] + Poster(#[from] BatchPosterError), +} + +/// How the submitter loop exited. +/// +/// There is only one deliberate exit path (shutdown). Danger detection lives +/// in the [`crate::recovery::DangerDetector`] worker; this type does not +/// concern itself with that signal. +#[derive(Debug)] +pub enum SubmitterExit { + /// Shutdown signal fired. + Shutdown, +} + +/// Outcome of one tick. Drives the outer loop's sleep cadence. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum TickOutcome { + /// Nothing pending; sleep before the next tick. + Idle, + /// Submitted one or more batches; re-enter immediately so the suffix + /// drains without idle-sleep. + Submitted(usize), + /// Transient provider error; log and sleep before retrying. + Transient, +} + +/// Pure: given the current submitter frontier and the batch nonces we just +/// observed on L1 past that frontier, compute the nonce at which we should +/// start submitting the remaining suffix. When the observed list is empty +/// (nothing new on L1) the result is just `frontier.accepted_next_nonce`. +fn decide_submit_start(frontier: SubmitterFrontier, recently_observed_nonces: &[u64]) -> u64 { + // Fold observed nonces over the safe-accepted frontier to derive the next + // unresolved nonce. The scan starts at `safe_block + 1` (the submitter + // asks the poster for that), so wallet-nonce ordering guarantees the + // observed list mirrors our submission order. + advance_expected_batch_nonce( + frontier.accepted_next_nonce, + recently_observed_nonces.iter().copied(), + ) +} + +pub struct BatchSubmitter { + db_path: String, + poster: Arc

, + idle_poll_interval: Duration, +} + +impl BatchSubmitter

{ + pub fn new(db_path: impl Into, poster: Arc

, config: BatchSubmitterConfig) -> Self { + Self { + db_path: db_path.into(), + poster, + idle_poll_interval: config.idle_poll_interval(), + } + } + + /// Spawn the worker loop. The `shutdown` signal is what the loop respects; + /// passing it at start time (instead of construction time) keeps the + /// construction phase pure. + pub fn start( + self, + shutdown: ShutdownSignal, + ) -> Result>, StorageOpenError> + { + let _ = Storage::open_read_only(self.db_path.as_str())?; + Ok(tokio::spawn( + async move { self.run_forever(shutdown).await }, + )) + } + + /// Top-level driver. Races the work loop against the shutdown signal. + /// + /// `biased;` polls the shutdown arm first on every wakeup so a concurrent + /// shutdown wins over an in-flight `run_loop` step. Without `biased`, + /// `select!` would pick randomly between two ready branches and could + /// process one more iteration before shutting down. + async fn run_forever( + self, + shutdown: ShutdownSignal, + ) -> Result { + tokio::select! { + biased; + _ = shutdown.wait_for_shutdown() => Ok(SubmitterExit::Shutdown), + result = self.run_loop() => result, + } + } + + /// Tick → sleep-if-idle → tick. Productive ticks re-enter immediately; + /// idle or transient-error ticks wait `idle_poll_interval`. Fatal errors + /// propagate. + async fn run_loop(&self) -> Result { + loop { + let outcome = match self.tick_once().await { + Ok(o) => o, + Err(BatchSubmitterError::Poster(source)) => { + error!(error = %source, "L1 provider error — will retry"); + TickOutcome::Transient + } + Err(fatal) => return Err(fatal), + }; + match outcome { + TickOutcome::Submitted(_) => continue, + TickOutcome::Idle | TickOutcome::Transient => { + tokio::time::sleep(self.idle_poll_interval).await; + } + } + } + } + + pub(crate) async fn tick_once(&self) -> Result { + let frontier = self.load_frontier().await?; + + // Must start scanning at `safe_block + 1`: after a danger-zone shutdown + // the flusher only returns once `Pending <= Safe`, so any wallet-nonce + // slots backed by blocks at or below the safe head are already + // resolved and folded into `accepted_next_nonce`. Re-scanning those + // blocks here would double-count the finalized prefix. + let recent_observed = self + .poster + .observed_submitted_batch_nonces(frontier.safe_block.saturating_add(1)) + .await?; + + let from_nonce = decide_submit_start(frontier, &recent_observed); + let pending = self.pending_batches(from_nonce).await?; + if pending.is_empty() { + return Ok(TickOutcome::Idle); + } + + for batch in &pending { + debug!( + batch_index = batch.batch_index, + nonce = batch.nonce, + "queueing batch for L1 submission" + ); + } + let submitted_count = pending.len(); + let payloads: Vec> = pending.into_iter().map(|b| b.encoded).collect(); + let tx_hashes = self.poster.submit_batches(payloads).await?; + if tx_hashes.len() != submitted_count { + return Err(BatchSubmitterError::Poster(BatchPosterError::Provider( + format!( + "poster returned {} tx hashes for {submitted_count} submitted batches", + tx_hashes.len(), + ), + ))); + } + + Ok(TickOutcome::Submitted(submitted_count)) + } + + async fn load_frontier(&self) -> Result { + let db_path = self.db_path.clone(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .submitter_frontier() + .map_err(BatchSubmitterError::from) + }) + .await + .map_err(|err| BatchSubmitterError::Join(err.to_string()))? + } + + async fn pending_batches( + &self, + min_nonce: u64, + ) -> Result, BatchSubmitterError> { + let db_path = self.db_path.clone(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .pending_batches(min_nonce) + .map_err(BatchSubmitterError::from) + }) + .await + .map_err(|err| BatchSubmitterError::Join(err.to_string()))? + } +} + +/// Advance `expected` by greedily consuming any matching observed nonce. +/// +/// `observed_nonces` is the stream of **batch nonces** (from the SSZ payload) +/// decoded from `InputAdded` events sent by our batch-submitter EOA, in L1 +/// event order. Because L1 mines txs from a single EOA in strict wallet-nonce +/// order, this stream is naturally gap-less at the wallet-nonce level: +/// tx[k]'s event cannot appear on-chain without tx[k-1]'s event, and the +/// observed batch nonce sequence therefore mirrors our submission order. +/// +/// Batch nonces themselves (unlike wallet nonces) CAN repeat across recovery +/// generations — e.g., after a cascade, a fresh batch reuses its invalidated +/// predecessor's nonce. That's why we still match on equality rather than +/// trusting a sort: in a post-recovery window, the same batch nonce can be +/// observed twice (once from the invalidated generation, once from the new +/// one), and we only want to advance once. +/// +/// Under the wallet-nonce ordering above, once the next `expected` doesn't +/// appear in the stream the frontier naturally stops advancing — the gap +/// means the scheduler hasn't seen that nonce on-chain yet (or observed it at +/// a different wallet nonce from an earlier generation). +fn advance_expected_batch_nonce( + mut expected: u64, + observed_nonces: impl IntoIterator, +) -> u64 { + for nonce in observed_nonces { + if nonce == expected { + expected = expected.saturating_add(1); + } + } + expected +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use alloy_primitives::Address; + + use super::{TickOutcome, decide_submit_start}; + use crate::l1::submitter::{BatchSubmitterConfig, poster::mock::MockBatchPoster}; + use crate::storage::test_helpers::{TestDb, temp_db}; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput, SubmitterFrontier}; + use sequencer_core::protocol::ProtocolTiming; + + const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); + + /// Protocol pinned to `BATCH_SUBMITTER_ADDRESS` — worker tests use that as + /// their test submitter, so populate sees the seeded safe_inputs. + fn submitter_test_protocol() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } + } + + fn default_test_config() -> BatchSubmitterConfig { + BatchSubmitterConfig { + idle_poll_interval_ms: 1000, + } + } + + fn seed_two_closed_batches(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + storage + .append_safe_inputs(0, &[], BATCH_SUBMITTER_ADDRESS, &submitter_test_protocol()) + .expect("record observed safe head"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 1"); + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 2"); + } + + fn seed_safe_submitted_batches(db_path: &str, safe_block: u64, nonces: &[u64]) { + let mut storage = Storage::open(db_path).expect("open storage"); + let inputs: Vec<_> = nonces + .iter() + .map(|nonce| StoredSafeInput { + sender: BATCH_SUBMITTER_ADDRESS, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: *nonce, + frames: Vec::new(), + }), + block_number: safe_block, + }) + .collect(); + storage + .append_safe_inputs( + safe_block, + inputs.as_slice(), + BATCH_SUBMITTER_ADDRESS, + &submitter_test_protocol(), + ) + .expect("append safe submitted batches"); + } + + #[tokio::test] + async fn tick_once_submits_first_missing_closed_batch() { + let TestDb { _dir, path } = temp_db("tick-submits"); + seed_two_closed_batches(&path); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = + super::BatchSubmitter::new(path.clone(), mock.clone(), default_test_config()); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(3)); + + let submissions = mock.submissions(); + assert_eq!(submissions.len(), 3); + assert_eq!(submissions[0].0, 0); + assert_eq!(submissions[1].0, 1); + assert_eq!(submissions[2].0, 2); + } + + #[tokio::test] + async fn tick_once_submits_nothing_when_already_caught_up() { + let TestDb { _dir, path } = temp_db("tick-caught-up"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1]); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_nonces(vec![2]); + let submitter = + super::BatchSubmitter::new(path.clone(), mock.clone(), default_test_config()); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Idle); + assert!(mock.submissions().is_empty()); + assert_eq!(mock.last_from_block(), Some(11)); + } + + #[tokio::test] + async fn tick_once_skips_already_submitted() { + let TestDb { _dir, path } = temp_db("tick-combines-prefix-and-suffix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1, 2]); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = + super::BatchSubmitter::new(path.clone(), mock.clone(), default_test_config()); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Idle); + assert!(mock.submissions().is_empty()); + } + + #[tokio::test] + async fn tick_once_submits_only_missing_suffix_from_safe_frontier() { + let TestDb { _dir, path } = temp_db("tick-safe-frontier-suffix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1]); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = + super::BatchSubmitter::new(path.clone(), mock.clone(), default_test_config()); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(1)); + assert_eq!(mock.last_from_block(), Some(11)); + + let submissions = mock.submissions(); + assert_eq!(submissions.len(), 1); + assert_eq!(submissions[0].0, 2); + } + + #[tokio::test] + async fn tick_once_replaces_from_latest_mined_prefix_not_safe_prefix() { + let TestDb { _dir, path } = temp_db("tick-latest-mined-prefix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0]); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_nonces(vec![1]); + let submitter = + super::BatchSubmitter::new(path.clone(), mock.clone(), default_test_config()); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(1)); + assert_eq!(mock.last_from_block(), Some(11)); + + let submissions = mock.submissions(); + assert_eq!(submissions.len(), 1); + assert_eq!(submissions[0].0, 2); + } + + #[tokio::test] + async fn tick_once_propagates_poster_errors() { + let TestDb { _dir, path } = temp_db("tick-poster-error"); + seed_two_closed_batches(&path); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_error(Some("rpc fail")); + let submitter = super::BatchSubmitter::new(path, mock, default_test_config()); + + let err = submitter + .tick_once() + .await + .expect_err("poster error should propagate"); + assert!(matches!(err, super::BatchSubmitterError::Poster(_))); + } + + // ── decide_submit_start (pure) ──────────────────────────────────────── + + #[test] + fn decide_submit_start_advances_past_observed_prefix() { + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 0, + }, + &[0, 1, 2], + ); + assert_eq!(from_nonce, 3); + } + + #[test] + fn decide_submit_start_stops_at_first_gap() { + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 0, + }, + &[0, 2, 3], + ); + assert_eq!(from_nonce, 1); + } + + #[test] + fn decide_submit_start_handles_empty_observed_list() { + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 5, + }, + &[], + ); + assert_eq!(from_nonce, 5); + } + + #[test] + fn decide_submit_start_advances_once_per_matching_nonce_across_recovery_generations() { + // Post-recovery scenario the `advance_expected_batch_nonce` doc calls + // out: batch nonces can repeat across recovery generations because a + // cascade re-uses the last valid ancestor's `nonce + 1`. The observed + // event stream can therefore contain the same batch nonce twice (once + // from the invalidated generation, once from the recovery generation). + // + // decide_submit_start must advance exactly ONCE per matching nonce — + // the second occurrence at a nonce that no longer equals `expected` is + // a no-op, as intended. The underlying fold is table-tested below; this + // pins the wrapper at the nonce-reuse case explicitly. + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 2, + }, + // Two events reporting nonce=2 (one per generation), then nonce=3. + &[2, 2, 3], + ); + // 2 matches expected=2 → advance to 3. Second 2 doesn't match + // expected=3, skip. 3 matches → advance to 4. + assert_eq!(from_nonce, 4); + } + + #[test] + fn advance_expected_batch_nonce_matches_scheduler_nonce_rule() { + assert_eq!(super::advance_expected_batch_nonce(0, Vec::::new()), 0); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 2]), 3); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 3]), 1); + assert_eq!(super::advance_expected_batch_nonce(0, vec![1, 2, 3]), 0); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 1, 2]), 3); + assert_eq!( + super::advance_expected_batch_nonce(0, vec![6, 4, 3, 2, 2, 0, 1]), + 2 + ); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 1]), 2); + assert_eq!(super::advance_expected_batch_nonce(2, vec![2, 3]), 4); + } +} diff --git a/sequencer/src/l2_tx_feed/mod.rs b/sequencer/src/l2_tx_feed/mod.rs deleted file mode 100644 index 7c78a45..0000000 --- a/sequencer/src/l2_tx_feed/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod error; -mod feed; - -#[cfg(test)] -mod tests; - -pub use error::{SubscribeError, SubscriptionError}; -pub use feed::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, Subscription}; diff --git a/sequencer/src/l2_tx_feed/tests.rs b/sequencer/src/l2_tx_feed/tests.rs deleted file mode 100644 index d93c8ed..0000000 --- a/sequencer/src/l2_tx_feed/tests.rs +++ /dev/null @@ -1,215 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::time::{Duration, SystemTime}; - -use alloy_primitives::{Address, Signature}; -use tempfile::TempDir; -use tokio::sync::oneshot; - -use super::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, SubscribeError}; -use crate::inclusion_lane::{PendingUserOp, SequencerError}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; -use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; -use sequencer_core::user_op::UserOp; - -#[test] -fn broadcast_user_op_serializes_with_hex_data() { - let msg = BroadcastTxMessage::from_offset_and_tx( - 7, - SequencedL2Tx::UserOp(ValidUserOp { - sender: Address::from_slice(&[0x11; 20]), - fee: 3, - data: vec![0xaa, 0xbb], - }), - ); - let json = serde_json::to_string(&msg).expect("serialize"); - assert!(json.contains("\"kind\":\"user_op\"")); - assert!(json.contains("\"offset\":7")); - assert!(json.contains("\"fee\":3")); - assert!(json.contains("\"data\":\"0xaabb\"")); -} - -#[test] -fn broadcast_direct_input_serializes_with_hex_payload() { - let msg = BroadcastTxMessage::from_offset_and_tx( - 9, - SequencedL2Tx::Direct(DirectInput { - sender: Address::ZERO, - block_number: 42, - payload: vec![0xcc, 0xdd], - }), - ); - let json = serde_json::to_string(&msg).expect("serialize"); - assert!(json.contains("\"kind\":\"direct_input\"")); - assert!(json.contains("\"offset\":9")); - assert!(json.contains("\"sender\":\"0x0000000000000000000000000000000000000000\"")); - assert!(json.contains("\"block_number\":42")); - assert!(json.contains("\"payload\":\"0xccdd\"")); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscribe_from_rejects_catchup_window() { - let db = test_db("catchup-window"); - seed_ordered_txs(db.path.as_str()); - let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); - - let result = feed.subscribe_from(0, 1); - - assert!(matches!( - result, - Err(SubscribeError::CatchUpWindowExceeded { - requested_offset: 0, - live_start_offset: 2, - max_catchup_events: 1, - }) - )); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscription_replays_existing_rows_in_order() { - let db = test_db("replay-existing"); - seed_ordered_txs(db.path.as_str()); - let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); - - let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); - - let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait first event") - .expect("first event"); - let second = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait second event") - .expect("second event"); - - assert_eq!(first.offset(), 0); - assert_eq!(second.offset(), 1); - - subscription.finish().await.expect("finish subscription"); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscription_filters_batch_submitter_safe_inputs() { - let db = test_db("filters-batch-submitter-inputs"); - let batch_submitter_address = Address::from([0xfe; 20]); - seed_ordered_txs_with_sender(db.path.as_str(), batch_submitter_address); - let feed = L2TxFeed::new( - db.path.clone(), - ShutdownSignal::default(), - L2TxFeedConfig { - idle_poll_interval: Duration::from_millis(2), - page_size: 64, - batch_submitter_address: Some(batch_submitter_address), - }, - ); - - let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); - let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait first event") - .expect("first event"); - - assert!(matches!( - first, - BroadcastTxMessage::UserOp { offset: 0, .. } - )); - - let no_second = tokio::time::timeout(Duration::from_millis(50), subscription.recv()).await; - assert!( - no_second.is_err(), - "filtered batch-submitter input should not be broadcast" - ); - - subscription.finish().await.expect("finish subscription"); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn shutdown_signal_closes_subscription() { - let db = test_db("shutdown-closes"); - seed_ordered_txs(db.path.as_str()); - let shutdown = ShutdownSignal::default(); - let feed = test_feed(db.path.as_str(), shutdown.clone()); - - let mut subscription = feed.subscribe_from(u64::MAX, u64::MAX).expect("subscribe"); - - shutdown.request_shutdown(); - - assert!( - tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait for subscription close") - .is_none() - ); - subscription.finish().await.expect("clean shutdown"); -} - -fn test_feed(db_path: &str, shutdown: ShutdownSignal) -> L2TxFeed { - L2TxFeed::new( - db_path.to_string(), - shutdown, - L2TxFeedConfig { - idle_poll_interval: Duration::from_millis(2), - page_size: 64, - batch_submitter_address: None, - }, - ) -} - -fn test_db(label: &str) -> TestDb { - let dir = TempDir::new().expect("create temp dir"); - let path = dir.path().join(format!("{label}.db")); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} - -fn seed_ordered_txs(db_path: &str) { - seed_ordered_txs_with_sender(db_path, Address::ZERO); -} - -fn seed_ordered_txs_with_sender(db_path: &str, direct_sender: Address) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - let (respond_to, _recv) = oneshot::channel::>(); - let pending = PendingUserOp { - signed: sequencer_core::user_op::SignedUserOp { - sender: Address::from_slice(&[0x11; 20]), - signature: Signature::test_signature(), - user_op: UserOp { - nonce: 0, - max_fee: 3, - data: vec![0x42].into(), - }, - }, - respond_to, - received_at: SystemTime::now(), - }; - - storage - .append_user_ops_chunk(&mut head, &[pending]) - .expect("append user-op chunk"); - storage - .append_safe_inputs( - 10, - &[StoredSafeInput { - sender: direct_sender, - payload: vec![0xaa], - block_number: 10, - }], - ) - .expect("append direct input"); - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) - .expect("close frame with one drained direct input"); -} - -struct TestDb { - _dir: TempDir, - path: String, -} diff --git a/sequencer/src/lib.rs b/sequencer/src/lib.rs index edb38c0..a40c98d 100644 --- a/sequencer/src/lib.rs +++ b/sequencer/src/lib.rs @@ -3,19 +3,27 @@ //! Sequencer prototype focused on deterministic inclusion and replay. //! -//! Flow: API -> inclusion lane -> SQLite -> catch-up replay. -//! The inclusion lane is the single writer that defines execution order. -pub mod api; -pub mod batch_submitter; -pub mod config; -pub mod inclusion_lane; -pub mod input_reader; -pub mod l2_tx_feed; -pub mod partition; -pub mod provider; -mod runtime; -pub mod shutdown; +//! Top-level layout follows the system's data flow: +//! +//! - `ingress` — submit API + inclusion lane (write path from external clients) +//! - `egress` — subscribe API + L2-tx feed (read path to internal indexers) +//! - `l1` — input reader, batch submitter, L1 helpers +//! - `storage` — SQLite-backed persistence (organized by writer role) +//! - `recovery` — cascade invalidation + recovery batch +//! - `runtime` — orchestration, config, shutdown +//! - `http` — shared HTTP error type + axum::serve orchestration +//! +//! The inclusion lane is the single writer of open-batch state; this is the +//! invariant the storage layer relies on. + +pub mod egress; +pub mod http; +pub mod ingress; +pub mod l1; +pub mod recovery; +pub mod runtime; pub mod storage; -pub use config::RunConfig; +pub use http::{ApiConfig, ApiError, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +pub use runtime::config::RunConfig; pub use runtime::{RunError, run}; diff --git a/sequencer/src/provider.rs b/sequencer/src/provider.rs deleted file mode 100644 index 40789d5..0000000 --- a/sequencer/src/provider.rs +++ /dev/null @@ -1,56 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::str::FromStr; -use std::time::Duration; - -use alloy::{ - providers::{DynProvider, Provider, ProviderBuilder}, - rpc::client::RpcClient, - signers::local::PrivateKeySigner, - transports::http::{Http, reqwest, reqwest::Url}, -}; -use alloy_transport::layers::RetryBackoffLayer; - -const REQUEST_TIMEOUT: Duration = Duration::from_secs(20); -const MAX_RATE_LIMIT_RETRIES: u32 = 5; -const INITIAL_BACKOFF_MS: u64 = 200; -const COMPUTE_UNITS_PER_SEC: u64 = 500; - -fn create_client(url: &str) -> Result { - let url = Url::parse(url).map_err(|e| format!("invalid RPC URL: {e}"))?; - - let http_client = reqwest::Client::builder() - .timeout(REQUEST_TIMEOUT) - .build() - .map_err(|e| format!("failed to build HTTP client: {e}"))?; - - let transport = Http::with_client(http_client, url); - let is_local = transport.guess_local(); - - let retry = RetryBackoffLayer::new( - MAX_RATE_LIMIT_RETRIES, - INITIAL_BACKOFF_MS, - COMPUTE_UNITS_PER_SEC, - ); - - Ok(RpcClient::builder() - .layer(retry) - .transport(transport, is_local)) -} - -/// Create a read-only provider with retry and timeout. -pub fn create_provider(url: &str) -> Result { - let client = create_client(url)?; - let provider = ProviderBuilder::new().connect_client(client); - Ok(provider.erased()) -} - -/// Create a provider with a wallet signer, retry, and timeout. -pub fn create_signer_provider(url: &str, private_key: &str) -> Result { - let client = create_client(url)?; - let signer = - PrivateKeySigner::from_str(private_key).map_err(|e| format!("invalid private key: {e}"))?; - let provider = ProviderBuilder::new().wallet(signer).connect_client(client); - Ok(provider.erased()) -} diff --git a/sequencer/src/recovery/detector.rs b/sequencer/src/recovery/detector.rs new file mode 100644 index 0000000..c9a5f11 --- /dev/null +++ b/sequencer/src/recovery/detector.rs @@ -0,0 +1,325 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime danger detector. +//! +//! A tiny background task that, every `poll_interval`, asks [`Storage::check_danger`] +//! whether recovery or refusal is needed. If so, the task exits with +//! [`DetectorExit::RecoveryRequired`] — the runtime turns that into a +//! deliberate non-error process shutdown, the orchestrator respawns, and +//! `run_preemptive_recovery` takes over on startup. +//! +//! This is its own worker (not part of the batch submitter) because the two +//! concerns are orthogonal: the submitter makes progress on L1, which involves +//! slow confirmations; the detector just reads the DB + wall clock at a fixed +//! cadence. Keeping them separate means one never delays the other, and each +//! stays a ~20-line state machine. +//! +//! Detection is eventually consistent with the input reader: a transition into +//! danger may lag by up to one `poll_interval`. The preemptive margin absorbs +//! this bounded lag. + +use std::time::Duration; + +use thiserror::Error; +use tracing::debug; + +use crate::runtime::clock::unix_now_ms; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{DangerStatus, Storage, StorageOpenError}; +use sequencer_core::protocol::ProtocolTiming; + +/// How the detector's loop exited. +/// +/// `RecoveryRequired` is a *deliberate* exit — not an error. The runtime maps +/// it to a distinct `RunError` variant so operators can tell "time to recover +/// or refuse startup" apart from "something crashed". +#[derive(Debug)] +pub enum DetectorExit { + /// Shutdown signal fired before any danger was detected. + Shutdown, + /// A non-safe danger status was observed. Stop and let startup dispatch + /// the recovery/refusal path from a fresh read. + RecoveryRequired { status: DangerStatus }, +} + +#[derive(Debug, Error)] +pub enum DangerDetectorError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("danger detector join error: {0}")] + Join(String), +} + +pub struct DangerDetector { + db_path: String, + protocol: ProtocolTiming, + poll_interval: Duration, +} + +impl DangerDetector { + pub fn new( + db_path: impl Into, + protocol: ProtocolTiming, + poll_interval: Duration, + ) -> Self { + Self { + db_path: db_path.into(), + protocol, + poll_interval, + } + } + + /// Spawn the detector loop. The `shutdown` signal is what the loop + /// respects; passing it at start time (instead of construction time) keeps + /// the construction phase pure. + pub fn start( + self, + shutdown: ShutdownSignal, + ) -> Result>, StorageOpenError> + { + let _ = Storage::open_read_only(self.db_path.as_str())?; + Ok(tokio::spawn( + async move { self.run_forever(shutdown).await }, + )) + } + + /// Top-level driver. Races the work loop against the shutdown signal. + /// + /// `biased;` polls the shutdown arm first on every wakeup so a concurrent + /// shutdown wins over an in-flight `run_loop` step. Without `biased`, + /// `select!` would pick randomly between two ready branches and could + /// process one more iteration before shutting down. + async fn run_forever( + self, + shutdown: ShutdownSignal, + ) -> Result { + tokio::select! { + biased; + _ = shutdown.wait_for_shutdown() => Ok(DetectorExit::Shutdown), + result = self.run_loop() => result, + } + } + + /// Tick → sleep → tick. Returns `RecoveryRequired` when a non-Safe danger + /// status fires. Shutdown is handled by the outer `run_forever` select, + /// so this loop has no shutdown concerns. + async fn run_loop(self) -> Result { + loop { + match self.check_once().await? { + DangerStatus::Safe => { + debug!("danger check: safe"); + } + status => { + // All non-Safe variants exit for recovery/refusal. The + // dispatch difference (flush vs no-flush vs refuse) + // only matters at the next startup — `decide_startup_action` + // re-runs `check_danger` and routes based on which variant + // fires this time. + tracing::error!( + ?status, + danger_threshold = self.protocol.danger_threshold(), + l1_read_stale_after_blocks = self.protocol.l1_read_stale_after_blocks, + "danger detected — triggering shutdown for startup recovery" + ); + return Ok(DetectorExit::RecoveryRequired { status }); + } + } + tokio::time::sleep(self.poll_interval).await; + } + } + + async fn check_once(&self) -> Result { + let db_path = self.db_path.clone(); + let protocol = self.protocol; + let now_ms = unix_now_ms(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .check_danger(&protocol, now_ms) + .map_err(DangerDetectorError::from) + }) + .await + .map_err(|err| DangerDetectorError::Join(err.to_string()))? + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::test_helpers::{SENDER_A, temp_db}; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; + use std::time::Duration; + + fn test_protocol() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: 1200, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } + } + + fn make_stale_batch_payload(nonce: u64, safe_block: u64) -> Vec { + ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block, + fee_price: 0, + }], + }) + } + + #[tokio::test] + async fn exits_on_shutdown_when_safe() { + let db = temp_db("detector-shutdown"); + let mut storage = Storage::open(&db.path).expect("open storage"); + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .append_safe_inputs(10, &[], SENDER_A, &test_protocol()) + .expect("record fresh safe-head observation"); + drop(storage); + + let shutdown = ShutdownSignal::default(); + let detector = + DangerDetector::new(db.path.clone(), test_protocol(), Duration::from_millis(50)); + let handle = detector.start(shutdown.clone()).expect("start detector"); + + tokio::time::sleep(Duration::from_millis(20)).await; + shutdown.request_shutdown(); + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + assert!(matches!(exit, DetectorExit::Shutdown)); + } + + #[tokio::test] + async fn exits_with_recovery_required_when_observed_closed_check_fires() { + // Closed frontier batch is aged past `danger_threshold` against the + // observed safe block — the closed-batch arm of `check_danger` trips. + let db = temp_db("detector-observed-closed-danger"); + let mut storage = Storage::open(&db.path).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 1"); + + let protocol = test_protocol(); + storage + .append_safe_inputs( + 1135, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &protocol, + ) + .expect("append"); + drop(storage); + + let shutdown = ShutdownSignal::default(); + let detector = DangerDetector::new(db.path.clone(), protocol, Duration::from_millis(50)); + let handle = detector.start(shutdown).expect("start detector"); + + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + match exit { + DetectorExit::RecoveryRequired { status } => { + assert_eq!(status, DangerStatus::ClosedBatchInDanger(1)); + } + other => panic!("expected recovery-required exit, got {other:?}"), + } + } + + #[tokio::test] + async fn exits_with_recovery_required_when_wall_clock_fallback_fires() { + // Safe head appears frozen — observed block-based checks wouldn't trip + // (ages look fine against the last observed safe block), but the + // wall-clock-adjusted check infers extended L1 silence and lowers the + // effective threshold. + // + // The detector treats observed and estimated danger identically (both + // exit for startup recovery), but the estimated path goes through + // `wall_clock_adjusted_danger_threshold` + // — a completely separate code path that deserves its own test. + let db = temp_db("detector-estimated-danger"); + let mut storage = Storage::open(&db.path).expect("open storage"); + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + let protocol = test_protocol(); + storage + .append_safe_inputs( + 1200, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 200, + }], + SENDER_A, + &protocol, + ) + .expect("append accepted batch 0"); + + // Observed check: batch 1's first_frame_safe_block = 100, current + // safe = 1200. age = 1100 < danger_threshold (1125), so observed + // closed-batch danger would NOT fire. + // + // Rewind synced_at_ms by 25 blocks' worth of wall-clock time so the + // wall-clock arm shaves 25 off the threshold (1125 → 1100). At 1100, + // batch 1's age = 1100 trips `>=`. Estimated batch danger fires. + let now_ms = crate::runtime::clock::unix_now_ms(); + drop(storage); + let rewind_conn = + Storage::open_connection(&db.path).expect("open raw connection to rewind synced_at_ms"); + rewind_conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + drop(rewind_conn); + + let shutdown = ShutdownSignal::default(); + let detector = DangerDetector::new(db.path.clone(), protocol, Duration::from_millis(50)); + let handle = detector.start(shutdown).expect("start detector"); + + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + match exit { + DetectorExit::RecoveryRequired { status } => { + assert_eq!(status, DangerStatus::EstimatedBatchInDanger(1)); + } + other => { + panic!("expected recovery-required exit from wall-clock fallback, got {other:?}") + } + } + } +} diff --git a/sequencer/src/recovery/flusher.rs b/sequencer/src/recovery/flusher.rs new file mode 100644 index 0000000..f4fb226 --- /dev/null +++ b/sequencer/src/recovery/flusher.rs @@ -0,0 +1,682 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Mempool flusher: submits no-op transactions to resolve pending wallet-nonce slots +//! before recovery runs. +//! +//! After a danger-zone detection, the sequencer goes offline and calls +//! [`MempoolFlusher::flush_and_wait`] to ensure every `w_nonce` slot is consumed +//! (either by its original batch transaction or by a replacement no-op). Once all +//! slots reach safe finality, the recovery procedure can read fully-finalized L1 state. + +use alloy::network::TransactionBuilder; +use alloy::providers::{ + DynProvider, PendingTransactionConfig, PendingTransactionError, Provider, WatchTxError, +}; +use alloy::rpc::types::BlockNumberOrTag; +use alloy_primitives::{Address, B256, U256}; +use std::time::Duration; +use thiserror::Error; +use tracing::{debug, error, info}; + +#[derive(Debug, Error)] +pub enum FlushError { + #[error("provider/transport: {0}")] + Provider(String), +} + +pub struct MempoolFlusher { + provider: DynProvider, + address: Address, + confirmation_timeout: Duration, + safe_poll_interval: Duration, +} + +/// Derive the flusher's watch/poll durations from the configured block time. +/// +/// `confirmation_timeout` is 10 blocks — long enough to survive one-off L1 +/// stalls but short enough to retry within a reasonable window. +/// `safe_poll_interval` is one block — matches the natural cadence for +/// `get_transaction_count(Safe)` to advance. +/// +/// H6 regression: both values must scale with `SEQ_SECONDS_PER_BLOCK`; a fixed +/// 12s assumption would mis-pace on non-mainnet chains. +fn derive_timeouts(seconds_per_block: u64) -> (Duration, Duration) { + ( + Duration::from_secs(10 * seconds_per_block), + Duration::from_secs(seconds_per_block), + ) +} + +/// Bump current 1559 fee estimates so flush no-ops are competitive with +/// pending batch transactions at the same wallet nonces. +/// +/// Safety does not depend on the no-op winning. Either the original batch tx +/// or the no-op can consume the slot; `flush_and_wait` only returns once +/// `Pending <= Safe`. These bumped fees are an operational acceleration, not a +/// correctness precondition. The `+ 1` on `max_fee` avoids integer-rounding +/// flat spots, and the priority doubling is intentionally generous. +fn bumped_replacement_fees(base_max_fee: u128, base_priority_fee: u128) -> (u128, u128) { + let new_max_fee = base_max_fee.saturating_mul(11) / 10 + 1; + let new_priority_fee = base_priority_fee.saturating_mul(2).max(1); + (new_max_fee, new_priority_fee) +} + +fn send_failures_error(failures: &[(u64, String)]) -> FlushError { + const MAX_SAMPLES: usize = 3; + + let samples = failures + .iter() + .take(MAX_SAMPLES) + .map(|(nonce, message)| format!("nonce {nonce}: {message}")) + .collect::>() + .join("; "); + let remaining = failures.len().saturating_sub(MAX_SAMPLES); + let suffix = if remaining == 0 { + String::new() + } else { + format!("; ... and {remaining} more") + }; + + FlushError::Provider(format!( + "failed to submit {} flush no-op transaction(s): {samples}{suffix}", + failures.len() + )) +} + +fn map_watch_error(err: PendingTransactionError) -> Result { + match err { + PendingTransactionError::TxWatcher(WatchTxError::Timeout) => Ok(false), + other => Err(FlushError::Provider(other.to_string())), + } +} + +impl MempoolFlusher { + pub fn new(provider: DynProvider, address: Address, seconds_per_block: u64) -> Self { + let (confirmation_timeout, safe_poll_interval) = derive_timeouts(seconds_per_block); + Self { + provider, + address, + confirmation_timeout, + safe_poll_interval, + } + } + + /// Flush the mempool by submitting no-op transactions for pending nonce + /// slots, then waiting until every slot is safe. + /// + /// The loop runs until `get_transaction_count(Pending) <= get_transaction_count(Safe)`, + /// meaning every slot has reached safe finality. + /// + /// At each iteration: + /// 1. Submit 0-ETH self-transfers for nonces between `Latest` and `Pending`. + /// These compete with any batch transactions still in the mempool. If + /// an original batch wins, that is also success: the slot advanced. + /// 2. Watch each submitted no-op for L1 inclusion. + /// 3. Sleep to let the safe head advance, then re-check the loop condition. + /// 4. If any watch times out, retry the outer loop (tx may have been dropped, + /// or the original batch may be making progress instead). + pub async fn flush_and_wait(&self) -> Result<(), FlushError> { + let mut attempt = 0u32; + loop { + let safe_nonce = self.nonce_at(BlockNumberOrTag::Safe).await?; + let pending_nonce = self.nonce_at(BlockNumberOrTag::Pending).await?; + + if pending_nonce <= safe_nonce { + info!( + safe_nonce, + "mempool flush complete — all slots reached safe finality" + ); + return Ok(()); + } + + let unresolved = pending_nonce - safe_nonce; + + if attempt == 0 { + info!( + safe_nonce, + pending_nonce, + unresolved, + "flushing mempool: submitting no-ops for unresolved w_nonce slots" + ); + } else { + // Retry after a previous timeout — re-print status so operators + // see the current state without scrolling back. + error!( + attempt, + safe_nonce, + pending_nonce, + unresolved, + "flush retry: previous attempt timed out, resubmitting" + ); + } + attempt += 1; + + // Submit no-ops for nonces between Latest and Pending. We submit + // the full range before watching any tx, so every unresolved slot + // gets a competing no-op attempt in this pass. + let latest_nonce = self.nonce_at(BlockNumberOrTag::Latest).await?; + let tx_hashes = self.submit_noops(latest_nonce, pending_nonce).await?; + + // Watch each submitted tx for L1 inclusion. + if !self.watch_txs(&tx_hashes).await? { + continue; + } + + // Sleep to let the safe head catch up before re-checking. + tokio::time::sleep(self.safe_poll_interval).await; + } + } + + /// Submit 0-ETH self-transfers for nonces `from_nonce..to_nonce`. + /// Returns the tx hashes of successfully submitted transactions. + async fn submit_noops(&self, from_nonce: u64, to_nonce: u64) -> Result, FlushError> { + if from_nonce >= to_nonce { + return Ok(Vec::new()); + } + + let fees = self + .provider + .estimate_eip1559_fees() + .await + .map_err(|e| FlushError::Provider(e.to_string()))?; + + let (bumped_max_fee, bumped_priority_fee) = + bumped_replacement_fees(fees.max_fee_per_gas, fees.max_priority_fee_per_gas); + + debug!( + from_nonce, + to_nonce, + count = to_nonce - from_nonce, + max_fee_per_gas = bumped_max_fee, + max_priority_fee = bumped_priority_fee, + "submitting flush no-ops" + ); + + let mut tx_hashes = Vec::new(); + let mut send_failures = Vec::new(); + for nonce in from_nonce..to_nonce { + let tx = alloy::rpc::types::TransactionRequest::default() + .with_to(self.address) + .with_value(U256::ZERO) + .with_nonce(nonce) + .with_max_fee_per_gas(bumped_max_fee) + .with_max_priority_fee_per_gas(bumped_priority_fee); + + match self.provider.send_transaction(tx).await { + Ok(pending) => { + let tx_hash = *pending.tx_hash(); + debug!(nonce, %tx_hash, "flush no-op submitted"); + tx_hashes.push(tx_hash); + } + Err(e) => { + let message = e.to_string(); + error!(nonce, error = %message, "flush no-op send failed"); + send_failures.push((nonce, message)); + } + } + } + + if !send_failures.is_empty() { + return Err(send_failures_error(send_failures.as_slice())); + } + + Ok(tx_hashes) + } + + /// Watch submitted transactions for L1 inclusion. + /// Uses the same `PendingTransactionConfig::watch` pattern as the batch poster. + /// Returns `true` if all txs confirmed, `false` on timeout. + async fn watch_txs(&self, tx_hashes: &[B256]) -> Result { + for tx_hash in tx_hashes { + let watch = PendingTransactionConfig::new(*tx_hash) + .with_required_confirmations(1) + .with_timeout(Some(self.confirmation_timeout)) + .with_provider(self.provider.root().clone()); + match watch.watch().await { + Ok(_) => { + debug!(%tx_hash, "flush no-op included on L1"); + } + Err(err @ PendingTransactionError::TxWatcher(WatchTxError::Timeout)) => { + // This should not happen during normal L1 operation. + // Possible causes: L1 congestion, tx dropped from mempool, + // gas price too low to compete. + error!( + %tx_hash, + timeout_secs = self.confirmation_timeout.as_secs(), + "flush no-op timed out waiting for L1 inclusion — will retry" + ); + return map_watch_error(err); + } + Err(err) => return map_watch_error(err), + } + } + Ok(true) + } + + async fn nonce_at(&self, block: BlockNumberOrTag) -> Result { + self.provider + .get_transaction_count(self.address) + .block_id(block.into()) + .await + .map_err(|e| FlushError::Provider(e.to_string())) + } +} + +#[cfg(test)] +impl MempoolFlusher { + fn with_timeouts( + mut self, + confirmation_timeout: Duration, + safe_poll_interval: Duration, + ) -> Self { + self.confirmation_timeout = confirmation_timeout; + self.safe_poll_interval = safe_poll_interval; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + use alloy::network::TransactionBuilder; + use alloy::node_bindings::Anvil; + use alloy::providers::Provider; + + // ── H5: replacement-fee bump keeps no-ops competitive ───────── + + #[test] + fn replacement_fee_bump_exceeds_ten_percent_for_max_fee() { + // `max_fee_per_gas` must strictly exceed base by ≥10% for any positive base. + for base in [1_u128, 10, 100, 1_000, 1_000_000, 1_000_000_000_000] { + let (new_max, _) = bumped_replacement_fees(base, 0); + assert!( + new_max.saturating_mul(10) >= base.saturating_mul(11), + "max_fee bump violates ≥10% rule: base={base}, new={new_max}", + ); + } + } + + #[test] + fn replacement_fee_bump_doubles_priority_fee() { + // `priority_fee` doubles (200%), easily clearing the 10% replacement threshold. + for base in [1_u128, 10, 1_000, 1_000_000_000] { + let (_, new_prio) = bumped_replacement_fees(0, base); + assert_eq!(new_prio, base.saturating_mul(2)); + assert!( + new_prio.saturating_mul(10) >= base.saturating_mul(11), + "priority bump violates ≥10% rule: base={base}, new={new_prio}", + ); + } + } + + #[test] + fn replacement_fee_floor_is_positive_even_when_base_is_zero() { + // If the estimator returns zero, bumped values are still positive so the + // tx is actually broadcast rather than rejected by the node. + let (new_max, new_prio) = bumped_replacement_fees(0, 0); + assert!(new_max >= 1); + assert!(new_prio >= 1); + } + + #[test] + fn send_failure_error_summarizes_failed_slots() { + let err = send_failures_error(&[ + (7, "nonce too low".to_string()), + (8, "replacement transaction underpriced".to_string()), + (9, "insufficient funds".to_string()), + (10, "fee cap less than block base fee".to_string()), + ]); + + let message = err.to_string(); + assert!(message.contains("failed to submit 4 flush no-op transaction(s)")); + assert!(message.contains("nonce 7: nonce too low")); + assert!(message.contains("nonce 8: replacement transaction underpriced")); + assert!(message.contains("nonce 9: insufficient funds")); + assert!(message.contains("and 1 more")); + assert!(!message.contains("nonce 10")); + } + + #[test] + fn watch_error_mapping_retries_only_timeouts() { + let timeout = map_watch_error(PendingTransactionError::TxWatcher(WatchTxError::Timeout)) + .expect("timeout should be a retryable watch result"); + assert!(!timeout, "timeout should ask the caller to retry"); + + let err = map_watch_error(PendingTransactionError::FailedToRegister) + .expect_err("non-timeout watcher failures must surface"); + assert!(matches!(err, FlushError::Provider(_))); + } + + #[test] + fn replacement_fee_bump_saturates_at_u128_max() { + // Overflow safety: astronomical base fees must not wrap around. + let (new_max, new_prio) = bumped_replacement_fees(u128::MAX, u128::MAX); + assert_eq!(new_max, u128::MAX / 10 + 1); + assert_eq!(new_prio, u128::MAX); + } + + // ── H6: timeouts derive from seconds_per_block ──────────────── + + #[test] + fn timeouts_derive_from_seconds_per_block() { + assert_eq!( + derive_timeouts(12), + (Duration::from_secs(120), Duration::from_secs(12)), + "mainnet 12s block: 120s confirmation, 12s poll", + ); + assert_eq!( + derive_timeouts(2), + (Duration::from_secs(20), Duration::from_secs(2)), + "fast L2 2s block: scaled proportionally", + ); + assert_eq!( + derive_timeouts(1), + (Duration::from_secs(10), Duration::from_secs(1)), + "minimum accepted block time (H8: SEQ_SECONDS_PER_BLOCK >= 1)", + ); + } + + #[test] + fn confirmation_timeout_is_ten_times_safe_poll_interval() { + // Structural invariant: confirmation window == 10 × poll interval. + for spb in [1_u64, 2, 5, 12, 30] { + let (conf, poll) = derive_timeouts(spb); + assert_eq!(conf, poll * 10); + } + } + + /// Verify that `anvil` is available. Panics with a clear message if not found. + fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); + } + + /// Spawn Anvil with manual mining and fast safe-finality (2 slots/epoch). + fn spawn_anvil() -> alloy::node_bindings::AnvilInstance { + Anvil::default() + .arg("--no-mining") + .arg("--slots-in-an-epoch") + .arg("2") + .timeout(30_000) + .spawn() + } + + /// Create a signer provider from an Anvil private key. + fn signer_provider(anvil: &alloy::node_bindings::AnvilInstance) -> DynProvider { + let key_hex = alloy_primitives::hex::encode(anvil.first_key().to_bytes()); + crate::l1::provider::create_signer_provider( + anvil.endpoint_url().as_str(), + &format!("0x{key_hex}"), + ) + .expect("create signer provider") + } + + /// Mine blocks at a fixed interval until the token is dropped. + fn start_miner(provider: DynProvider, interval: Duration) -> tokio::sync::oneshot::Sender<()> { + let (stop_tx, mut stop_rx) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + loop { + tokio::select! { + _ = &mut stop_rx => break, + _ = tokio::time::sleep(interval) => { + let _ = provider.raw_request::<_, serde_json::Value>( + "evm_mine".into(), ()).await; + } + } + } + }); + stop_tx + } + + /// Send a 0-ETH self-transfer at a specific nonce (without waiting for inclusion). + async fn send_tx_at_nonce(provider: &DynProvider, addr: Address, nonce: u64) { + let fees = provider + .estimate_eip1559_fees() + .await + .expect("estimate fees"); + let tx = alloy::rpc::types::TransactionRequest::default() + .with_to(addr) + .with_value(U256::ZERO) + .with_nonce(nonce) + .with_max_fee_per_gas(fees.max_fee_per_gas) + .with_max_priority_fee_per_gas(fees.max_priority_fee_per_gas); + let _ = provider.send_transaction(tx).await.expect("send tx"); + } + + #[tokio::test] + async fn flush_is_noop_when_no_pending_nonces() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Mine a few blocks so safe head advances past genesis. + for _ in 0..4 { + let _: serde_json::Value = provider + .raw_request("evm_mine".into(), ()) + .await + .expect("mine"); + } + + let flusher = MempoolFlusher::new(provider, addr, 12); + // No pending txs — should return immediately. + flusher.flush_and_wait().await.expect("flush"); + } + + #[tokio::test] + async fn flush_resolves_pending_nonces_to_safe() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Send 3 txs into the mempool (unmined). + for nonce in 0..3 { + send_tx_at_nonce(&provider, addr, nonce).await; + } + + // Verify: pending=3, safe=0. + let pending = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Pending.into()) + .await + .expect("pending nonce"); + assert_eq!(pending, 3); + + let safe = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce"); + assert_eq!(safe, 0); + + // Start a background miner so blocks are produced. + let _miner = start_miner(provider.clone(), Duration::from_millis(100)); + + // Run the flusher — it should resolve all 3 nonces to safe. + let flusher = MempoolFlusher::new(provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(10), flusher.flush_and_wait()) + .await + .expect("flush should complete within timeout") + .expect("flush should succeed"); + + // Verify: safe nonce caught up. + let safe_after = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 3, + "safe nonce should be >= 3 after flush, got {safe_after}" + ); + } + + #[tokio::test] + async fn flush_handles_already_mined_but_not_safe() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Send 2 txs and mine them (latest but not safe). + for nonce in 0..2 { + send_tx_at_nonce(&provider, addr, nonce).await; + } + let _: serde_json::Value = provider + .raw_request("evm_mine".into(), ()) + .await + .expect("mine"); + + let latest = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Latest.into()) + .await + .expect("latest nonce"); + assert_eq!(latest, 2, "txs should be mined"); + + let safe = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce"); + assert_eq!(safe, 0, "txs should not be safe yet"); + + // Start miner to advance safe head. + let _miner = start_miner(provider.clone(), Duration::from_millis(100)); + + // Flusher should wait for safe finality (no new txs to submit). + let flusher = MempoolFlusher::new(provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(10), flusher.flush_and_wait()) + .await + .expect("flush should complete within timeout") + .expect("flush should succeed"); + + let safe_after = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 2, + "safe nonce should be >= 2 after flush, got {safe_after}" + ); + } + + // ── flusher under extended provider outage ────────────────────────── + // + // Implementation note (matters for what this test pins): `flush_and_wait` + // does NOT retry internally on `Provider` errors — a failed `nonce_at` + // call propagates via `?` and the function returns. "Retry forever" is + // really the orchestrator's restart loop: on each respawn a fresh flusher + // is constructed and tried, and this repeats until the provider becomes + // reachable again. The e2e suite covers that orchestrator-loop story via + // `respawn_until_stable`. + // + // This test pins the two ends of that contract: (a) a mid-flush + // disconnect surfaces as `FlushError::Provider` fast (no hang, no + // internal retry), and (b) a fresh flusher call after reconnect + // completes and consumes the pending wallet-nonce slot. + + #[tokio::test] + async fn flush_surfaces_provider_error_under_disconnect_and_completes_on_reconnect() { + use rollups_harness::TcpProxy; + + require_anvil(); + + let anvil = spawn_anvil(); + // Direct-to-Anvil provider: the test uses this to seed pending + // mempool state and inspect the chain. Bypasses the proxy so the + // seeding itself isn't affected by disconnect. + let direct_provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Proxy in front of Anvil — this is what the flusher dials. Anvil's + // endpoint uses `localhost` which the proxy's upstream parser rejects + // (it expects a literal IP). Swap for `127.0.0.1` so `parse` accepts. + let anvil_upstream = anvil.endpoint().replace("localhost", "127.0.0.1"); + let proxy = TcpProxy::spawn(anvil_upstream.as_str()) + .await + .expect("spawn proxy"); + + let key_hex = alloy_primitives::hex::encode(anvil.first_key().to_bytes()); + let proxied_provider = crate::l1::provider::create_signer_provider( + proxy.endpoint().as_str(), + &format!("0x{key_hex}"), + ) + .expect("create signer provider through proxy"); + + // Seed: submit a tx at wallet-nonce 0 into Anvil's mempool (auto- + // mining is off, so it stays pending). The flusher now has work. + send_tx_at_nonce(&direct_provider, addr, 0).await; + let pending = direct_provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Pending.into()) + .await + .expect("pending nonce"); + assert_eq!(pending, 1, "seed tx should be pending"); + + // Disconnect the proxy. The flusher's provider can no longer reach + // Anvil — any RPC call sees a torn-down TCP connection. + proxy.disconnect(); + let flusher = MempoolFlusher::new(proxied_provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(2), Duration::from_millis(200)); + + // `flush_and_wait` must fail fast (no internal retry loop). Wrap in + // a generous outer timeout just to bound test flakiness if alloy's + // HTTP client has small internal retries. + let err = tokio::time::timeout(Duration::from_secs(5), flusher.flush_and_wait()) + .await + .expect("flush_and_wait must not hang under disconnect") + .expect_err("flush_and_wait must surface a Provider error under disconnect"); + assert!( + matches!(err, FlushError::Provider(_)), + "expected FlushError::Provider, got: {err:?}", + ); + + // Reconnect the proxy + start mining so the flusher can make forward + // progress. This models the orchestrator's next respawn succeeding + // after the provider returns. + proxy.reconnect(); + let _miner = start_miner(direct_provider.clone(), Duration::from_millis(100)); + + // A fresh flusher (a respawn would build a new one from scratch). + // It should now read nonces, replace the pending tx with a bumped- + // fee no-op (or let the original land), wait for safe, and return. + let flusher_after = MempoolFlusher::new(proxied_provider, addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(15), flusher_after.flush_and_wait()) + .await + .expect("flush_and_wait should complete after reconnect") + .expect("flush should succeed once the provider is reachable"); + + // Forward progress: the nonce-0 slot was consumed (either by the + // flusher's no-op or by the original tx landing). `safe_nonce` is + // >= 1 only if something at nonce 0 reached safe finality — proof + // the flusher completed its job end-to-end. + let safe_after = direct_provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 1, + "nonce-0 slot must be consumed and safe after flush, got {safe_after}", + ); + + proxy.shutdown().await.expect("proxy shutdown"); + } +} diff --git a/sequencer/src/recovery/mod.rs b/sequencer/src/recovery/mod.rs new file mode 100644 index 0000000..fcbcad8 --- /dev/null +++ b/sequencer/src/recovery/mod.rs @@ -0,0 +1,417 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Preemptive recovery: detect danger zone, then recover via Tip invalidation +//! or post-flush cascade. +//! +//! At runtime a dedicated [`DangerDetector`] worker polls `Storage::check_danger` each +//! tick. If the L1 view is stale, a closed batch or Tip crosses +//! `danger_threshold`, or the batch-relative wall-clock estimate fires during +//! an L1 outage, the detector exits with `DetectorExit::RecoveryRequired`, the +//! runtime maps that to `DangerDetectorExit::DangerDetected` under +//! `RunError::Worker`, and the process exits. The detector tripping is *only* +//! a trigger to enter startup recovery/refusal — it doesn't make the cascade +//! decision. External orchestration restarts the sequencer, and this startup +//! path runs. +//! +//! Startup recovery branches on [`decide_startup_action`]: +//! +//! - `FlushAndCascade`: a closed batch past gold is dangerous. Flush the mempool, +//! re-sync the safe head, then call [`Storage::recover_post_flush`] which cascades +//! everything past the gold frontier (every non-gold batch is doomed: +//! Silver-stale, Silver-poisoned, or Pending no-op'd). If all closed are gold, +//! falls through to a Tip danger-zone check — see `docs/recovery/README.md` Step 5. +//! - `RecoverTip`: only the open Tip is dangerous. It has no L1 footprint, so call +//! [`Storage::recover_aging_tip`] directly without flushing. +//! - `Proceed`: no danger detected. No DB writes; the lane handles genesis init +//! via [`Storage::initialize_open_state`] if the DB is fresh. +//! - `Refuse`: L1 view is stale or batch-relative estimated danger fired; bail +//! out and surface to the operator. +//! +//! ## Fault model +//! +//! Recovery is designed to handle **submission and outage failures**: the sequencer +//! crashes, the L1 provider becomes unreachable, transactions are dropped from the +//! mempool, or the process is offline for an extended period. It is **not** designed +//! to handle arbitrarily malformed self-submissions. The scheduler frontier +//! reconstruction (`populate_safe_accepted_batches`) trusts that on-chain batches +//! from the sequencer's own address are structurally valid. This is a deliberate +//! system assumption, not a gap — the sequencer controls its own submissions. +//! +//! See `docs/recovery/` for the full design, TLA+ specs, and design history. + +mod detector; +mod flusher; + +use thiserror::Error; + +use crate::l1::reader::{InputReader, InputReaderError}; +use crate::runtime::config::L1Config; +use crate::storage::{self, DangerStatus, StorageOpenError}; +pub use detector::{DangerDetector, DangerDetectorError, DetectorExit}; +pub use flusher::MempoolFlusher; +use sequencer_core::protocol::ProtocolTiming; + +#[derive(Debug, Error)] +pub enum RecoveryError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("flush: {0}")] + Flush(#[from] flusher::FlushError), + #[error("input reader: {0}")] + InputReader(#[from] InputReaderError), + #[error("provider: {0}")] + Provider(String), + #[error("startup refused: {0:?}")] + Refuse(RefuseReason), +} + +/// Why startup cannot proceed safely. +/// +/// Each variant captures a DB/L1-view state that makes recovery or normal +/// startup unsafe. The operator sees the variant in logs and must intervene. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RefuseReason { + /// The L1 safe block timestamp is too old or unknown, so the local L1 view + /// is not usable for recovery or continued soft confirmations. + L1ViewStale, + /// Batch-relative wall-clock estimation says this batch consumed its + /// remaining runway, but the observed safe block has not crossed danger. + /// Refuse rather than recover from estimated state. + EstimatedBatchInDanger { batch_index: u64 }, +} + +/// What a fresh startup must do, given the current danger state. +/// +/// Pure function output — no side effects. The `run_preemptive_recovery` +/// driver executes the chosen action. +/// +/// The four non-Refuse variants encode the recovery split: +/// +/// - `Proceed`: no danger detected. No recovery work needed; the lane handles +/// genesis init on first start. +/// - `RecoverTip`: aging Tip, no closed batch in danger. The Tip has no L1 +/// footprint, so we cascade it directly with no flush. +/// - `FlushAndCascade`: closed batch in danger. We need a flush to resolve +/// its L1 transaction's fate before the cascade decision. +/// - `Refuse`: can't proceed safely; surface to the operator. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StartupAction { + /// No danger; no DB writes (lane handles genesis init). + Proceed, + /// Open Tip is past `danger_threshold` and no closed batch is in danger. + /// No flush needed (Tip has no L1 slot to resolve); cascade the Tip + /// directly. + RecoverTip { batch_index: u64 }, + /// Closed batch past the gold frontier is in danger. Flush the mempool, + /// re-sync, then run the post-flush cascade. + FlushAndCascade { batch_index: u64 }, + /// Can't proceed safely; return the reason and let the operator decide. + Refuse(RefuseReason), +} + +impl StartupAction { + fn label(self) -> &'static str { + match self { + StartupAction::Proceed => "proceed", + StartupAction::RecoverTip { .. } => "recover_tip", + StartupAction::FlushAndCascade { .. } => "flush_and_cascade", + StartupAction::Refuse(_) => "refuse", + } + } +} + +fn danger_status_label(danger: DangerStatus) -> &'static str { + match danger { + DangerStatus::Safe => "safe", + DangerStatus::L1ViewStale => "l1_view_stale", + DangerStatus::ClosedBatchInDanger(_) => "closed_batch_in_danger", + DangerStatus::TipInDanger(_) => "tip_in_danger", + DangerStatus::EstimatedBatchInDanger(_) => "estimated_batch_in_danger", + } +} + +fn danger_batch_index(danger: DangerStatus) -> Option { + match danger { + DangerStatus::ClosedBatchInDanger(batch_index) + | DangerStatus::TipInDanger(batch_index) + | DangerStatus::EstimatedBatchInDanger(batch_index) => Some(batch_index), + DangerStatus::Safe | DangerStatus::L1ViewStale => None, + } +} + +fn refuse_reason_label(reason: RefuseReason) -> &'static str { + match reason { + RefuseReason::L1ViewStale => "l1_view_stale", + RefuseReason::EstimatedBatchInDanger { .. } => "estimated_batch_in_danger", + } +} + +/// Pure decision: given the danger status, return what startup should do. L1 +/// reachability is an execution concern: if `FlushAndCascade` cannot reach L1, +/// the flusher returns an error and the orchestrator retries. +pub fn decide_startup_action(danger: DangerStatus) -> StartupAction { + match danger { + DangerStatus::Safe => StartupAction::Proceed, + DangerStatus::ClosedBatchInDanger(batch_index) => { + StartupAction::FlushAndCascade { batch_index } + } + DangerStatus::TipInDanger(batch_index) => StartupAction::RecoverTip { batch_index }, + DangerStatus::L1ViewStale => StartupAction::Refuse(RefuseReason::L1ViewStale), + DangerStatus::EstimatedBatchInDanger(batch_index) => { + StartupAction::Refuse(RefuseReason::EstimatedBatchInDanger { batch_index }) + } + } +} + +/// Run the full preemptive recovery procedure at startup. +/// +/// 1. Try to sync the safe head from L1. If L1 is unreachable, continue with +/// the persisted view; whether that view is fresh enough is decided by +/// `check_danger` in step 2 — a stale persisted view returns +/// `L1ViewStale` and step 3 refuses. +/// 2. Consult [`decide_startup_action`] to pick what to do. +/// 3. If the decision is `FlushAndCascade`: flush the mempool, re-sync, then +/// continue. If `Refuse`: bail out and let the orchestrator retry. +/// 4. Run the atomic recovery transaction (cascade stale batches if any, +/// always re-open the Tip if missing). +/// +/// Returns the list of invalidated batch indices (empty if no stale batches). +pub async fn run_preemptive_recovery( + db_path: &str, + input_reader: &mut InputReader, + l1_config: &L1Config, + protocol: &ProtocolTiming, +) -> Result, RecoveryError> { + // ── Step 1: Sync safe head (tolerate L1 failure) ─────────────── + // + // `sync_to_current_safe_head` goes through `append_safe_inputs`, which + // maintains `safe_accepted_batches` atomically with each advance. After + // a successful sync, the scheduler-frontier view is consistent with + // l1_safe_head for every downstream reader. + let l1_reachable = match input_reader.sync_to_current_safe_head().await { + Ok(()) => { + tracing::info!("L1 safe head synced"); + true + } + Err(e) => { + let InputReaderError::Provider(error) = e else { + return Err(RecoveryError::InputReader(e)); + }; + tracing::error!(error = %error, "L1 unreachable during startup safe-head sync"); + false + } + }; + + // ── Step 2: Read danger and decide action ───────────────────── + let danger = { + let mut storage = storage::Storage::open(db_path)?; + storage.check_danger(protocol, crate::runtime::clock::unix_now_ms())? + }; + let action = decide_startup_action(danger); + tracing::info!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + l1_reachable, + danger_threshold = protocol.danger_threshold(), + max_wait_blocks = protocol.max_wait_blocks, + l1_read_stale_after_blocks = protocol.l1_read_stale_after_blocks, + "startup recovery decision" + ); + + // ── Step 3: Execute decision ─────────────────────────────────── + // + // The three non-Refuse paths split the recovery work: + // + // - `Proceed`: no DB writes. A `Proceed` decision means no batch is in + // danger and the persisted state is fine as-is. Closed batches past + // gold (if any) stay in their natural lifecycle. + // + // - `RecoverTip`: no flush. Only the open Tip crossed `danger_threshold`; + // it has no L1 slot to resolve, so it can be invalidated directly. + // + // - `FlushAndCascade`: flush resolves every wallet-nonce slot, then + // re-sync brings the gold frontier to its maximum extent. After that + // point, *everything past gold is doomed* (Silver-stale, + // Silver-poisoned, or Pending-killed — see `Storage::recover_post_flush` + // docs). Cascade unconditionally from the first non-gold. + let invalidated = match action { + StartupAction::Proceed => { + tracing::info!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + "no danger zone detected — proceeding without recovery" + ); + // No Tip-creation here. The only production scenario where the DB + // has no Tip is genesis (fresh DB, never started). The lane handles + // that via `initialize_open_state`, which it skips when a Tip + // exists. All other Tip-mutating paths (lane rotation, recovery + // cascade) are wrapped in single rusqlite transactions and cannot + // leave the DB Tip-less. Code between this point and the lane's + // first init is construction-only and does not read the Tip. + Vec::new() + } + StartupAction::RecoverTip { batch_index } => { + tracing::error!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + tip_batch_index = batch_index, + danger_threshold = protocol.danger_threshold(), + "open Tip in danger zone — invalidating and opening fresh Tip (no flush)" + ); + let mut storage = storage::Storage::open(db_path)?; + storage.recover_aging_tip(protocol.danger_threshold())? + } + StartupAction::FlushAndCascade { batch_index } => { + tracing::error!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + batch_index, + danger_threshold = protocol.danger_threshold(), + max_wait_blocks = protocol.max_wait_blocks, + "closed batch in danger zone — entering preemptive recovery (flush + cascade)" + ); + run_flush_and_cascade(db_path, input_reader, l1_config, protocol).await? + } + StartupAction::Refuse(reason) => { + tracing::error!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + ?reason, + refuse_reason = refuse_reason_label(reason), + l1_reachable, + "startup refused: cannot recover safely" + ); + return Err(RecoveryError::Refuse(reason)); + } + }; + + if invalidated.is_empty() { + tracing::info!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + invalidated_count = 0, + "startup recovery complete — no batches invalidated" + ); + } else { + // Successful self-heal: the system invalidated the doomed suffix and + // opened a recovery batch as designed. The upstream "danger detected" + // log already alerted the operator at error level; this completes + // that incident with a non-error outcome. + tracing::warn!( + danger_status = danger_status_label(danger), + danger_batch_index = ?danger_batch_index(danger), + startup_action = action.label(), + invalidated_count = invalidated.len(), + batches = ?invalidated, + "startup recovery complete — batches invalidated and recovery batch opened" + ); + } + + Ok(invalidated) +} + +/// Execute the flush-and-cascade phase: resolve every pending wallet-nonce +/// slot on L1, re-sync the safe head so the gold frontier reflects post-flush +/// state, then cascade-invalidate the doomed non-gold suffix and open a fresh +/// recovery Tip. +/// +/// The four steps form one logical phase — they have no meaning on their own +/// and the orchestrator only ever runs them as a unit. +async fn run_flush_and_cascade( + db_path: &str, + input_reader: &mut InputReader, + l1_config: &L1Config, + protocol: &ProtocolTiming, +) -> Result, RecoveryError> { + let flush_provider = crate::l1::provider::create_signer_provider( + &l1_config.eth_rpc_url, + &l1_config.batch_submitter_private_key, + ) + .map_err(|e| RecoveryError::Provider(e.to_string()))?; + let flusher = MempoolFlusher::new( + flush_provider, + l1_config.batch_submitter_address, + protocol.seconds_per_block, + ); + flusher.flush_and_wait().await?; + + // If this re-sync errors out, L1 has been flushed but the DB has NOT been + // cascaded — we exit with the InputReaderError and rely on the orchestrator + // to respawn. That's safe by design: + // + // - `flush_and_wait` is idempotent: on the next attempt it queries L1 for + // pending wallet-nonces, finds zero (the previous flush cleared them), + // and returns immediately. + // - `check_danger` is stable across the failure window: safe_block only + // moves forward and flush doesn't retroactively change closed batches' + // `first_frame_safe_block`, so the danger condition that fired before + // still fires after the restart. + // - `recover_post_flush` is idempotent against the resulting DB state + // (verified by `after_post_recovery_crash_is_no_op` in `recovery_tests`). + // + // So a failure here just costs an extra orchestrator respawn; correctness + // is preserved. + // + // More importantly, it refuses to boot, during a recovery scenario, when + // we can't reach L1. + tracing::info!("re-syncing L1 safe head after flush"); + input_reader.sync_to_current_safe_head().await?; + + tracing::info!("running post-flush recovery (cascade non-gold suffix)"); + let mut storage = storage::Storage::open(db_path)?; + Ok(storage.recover_post_flush(protocol.danger_threshold())?) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn proceed_on_safe() { + assert_eq!( + decide_startup_action(DangerStatus::Safe), + StartupAction::Proceed + ); + } + + #[test] + fn flush_and_cascade_on_closed_batch_in_danger() { + assert_eq!( + decide_startup_action(DangerStatus::ClosedBatchInDanger(42)), + StartupAction::FlushAndCascade { batch_index: 42 } + ); + } + + #[test] + fn refuse_on_l1_view_stale() { + assert_eq!( + decide_startup_action(DangerStatus::L1ViewStale), + StartupAction::Refuse(RefuseReason::L1ViewStale) + ); + } + + #[test] + fn refuse_on_estimated_batch_in_danger() { + assert_eq!( + decide_startup_action(DangerStatus::EstimatedBatchInDanger(7)), + StartupAction::Refuse(RefuseReason::EstimatedBatchInDanger { batch_index: 7 }) + ); + } + + #[test] + fn recover_tip_in_danger() { + assert_eq!( + decide_startup_action(DangerStatus::TipInDanger(11)), + StartupAction::RecoverTip { batch_index: 11 } + ); + } +} diff --git a/sequencer/src/runtime.rs b/sequencer/src/runtime.rs deleted file mode 100644 index 7016b78..0000000 --- a/sequencer/src/runtime.rs +++ /dev/null @@ -1,420 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use thiserror::Error; -use tracing::warn; - -use crate::api::{self, ApiConfig}; -use crate::batch_submitter::{BatchPosterConfig, EthereumBatchPoster}; -use crate::batch_submitter::{BatchSubmitter, BatchSubmitterConfig, BatchSubmitterError}; -use crate::config::{L1Config, RunConfig}; -use crate::inclusion_lane::{InclusionLane, InclusionLaneConfig, InclusionLaneError}; -use crate::input_reader::{InputReader, InputReaderConfig, InputReaderError}; -use crate::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{self, StorageOpenError}; -use sequencer_core::application::Application; - -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; -const QUEUE_CAPACITY: usize = 8192; -const INPUT_READER_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(2); - -#[derive(Debug, Error)] -pub enum RunError { - #[error(transparent)] - OpenStorage(#[from] StorageOpenError), - #[error(transparent)] - Io(#[from] std::io::Error), - #[error("server stopped unexpectedly")] - ServerStoppedUnexpectedly, - #[error("server join error: {source}")] - ServerJoin { - #[source] - source: tokio::task::JoinError, - }, - #[error("inclusion lane stopped unexpectedly")] - InclusionLaneStoppedUnexpectedly, - #[error("inclusion lane exited: {source}")] - InclusionLane { - #[source] - source: InclusionLaneError, - }, - #[error("inclusion lane join error: {source}")] - InclusionLaneJoin { - #[source] - source: tokio::task::JoinError, - }, - #[error("input reader stopped unexpectedly")] - InputReaderStoppedUnexpectedly, - #[error("input reader exited: {source}")] - InputReader { - #[source] - source: InputReaderError, - }, - #[error("input reader join error: {source}")] - InputReaderJoin { - #[source] - source: tokio::task::JoinError, - }, - #[error("batch submitter stopped unexpectedly")] - BatchSubmitterStoppedUnexpectedly, - #[error("batch submitter exited: {source}")] - BatchSubmitter { - #[source] - source: BatchSubmitterError, - }, - #[error("batch submitter join error: {source}")] - BatchSubmitterJoin { - #[source] - source: tokio::task::JoinError, - }, -} - -enum FirstExit { - Signal(Option), - Server(RunError), - InclusionLane(RunError), - InputReader(RunError), - BatchSubmitter(RunError), -} - -pub async fn run(app: A, config: RunConfig) -> Result<(), RunError> -where - A: Application + 'static, -{ - let domain = config.build_domain(); - let shutdown = ShutdownSignal::default(); - - // Ensure the data directory exists before any component tries to open the DB. - std::fs::create_dir_all(&config.data_dir)?; - let db_path = config.db_path(); - - // Single L1/InputBox config shared by input reader and batch submitter (no duplicate RPC URL or addresses). - let batch_submitter_private_key = config.resolve_private_key()?; - - let batch_submitter_address = { - use alloy::signers::local::PrivateKeySigner; - use std::str::FromStr; - PrivateKeySigner::from_str(&batch_submitter_private_key) - .map_err(|e| RunError::Io(std::io::Error::other(e.to_string())))? - .address() - }; - let mut input_reader = InputReader::new( - db_path.clone(), - shutdown.clone(), - InputReaderConfig { - rpc_url: config.eth_rpc_url.clone(), - app_address: config.app_address, - poll_interval: INPUT_READER_POLL_INTERVAL, - long_block_range_error_codes: config.long_block_range_error_codes.clone(), - }, - ) - .await - .map_err(|source| RunError::InputReader { source })?; - let input_reader_genesis_block = input_reader.genesis_block(); - let l1_config = L1Config { - eth_rpc_url: config.eth_rpc_url.clone(), - input_box_address: input_reader.input_box_address(), - app_address: config.app_address, - batch_submitter_private_key, - batch_submitter_address, - }; - input_reader - .sync_to_current_safe_head() - .await - .map_err(|source| RunError::InputReader { source })?; - - tracing::info!( - http_addr = %config.http_addr, - data_dir = %config.data_dir, - eth_rpc_url = %l1_config.eth_rpc_url, - input_box_address = %l1_config.input_box_address, - input_reader_genesis_block, - chain_id = config.chain_id, - app_address = %l1_config.app_address, - "starting sequencer" - ); - - let storage = storage::Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; - let (tx, mut inclusion_lane_handle) = InclusionLane::start( - QUEUE_CAPACITY, - shutdown.clone(), - app, - storage, - InclusionLaneConfig::new(l1_config.batch_submitter_address), - ); - let mut input_reader_handle = input_reader.start()?; - - // Batch submitter uses the same L1 config (InputBox address and RPC URL) as the input reader. - let batch_submitter_config = BatchSubmitterConfig { - idle_poll_interval_ms: config.batch_submitter_idle_poll_interval_ms, - }; - let poster_config = BatchPosterConfig { - l1_submit_address: l1_config.input_box_address, - app_address: l1_config.app_address, - batch_submitter_address: l1_config.batch_submitter_address, - start_block: input_reader_genesis_block, - confirmation_depth: config.batch_submitter_confirmation_depth, - long_block_range_error_codes: config.long_block_range_error_codes, - }; - let provider = build_batch_submitter_provider(&l1_config)?; - - // Validate that the RPC chain ID matches --chain-id. - use alloy::providers::Provider; - let rpc_chain_id = provider - .get_chain_id() - .await - .map_err(|e| std::io::Error::other(format!("failed to query RPC chain ID: {e}")))?; - assert_eq!( - rpc_chain_id, config.chain_id, - "RPC chain ID {rpc_chain_id} does not match --chain-id {}", - config.chain_id - ); - - let poster = std::sync::Arc::new(EthereumBatchPoster::new(provider, poster_config)); - let submitter = BatchSubmitter::new( - db_path.clone(), - l1_config.batch_submitter_address, - poster, - shutdown.clone(), - batch_submitter_config, - ); - let mut batch_submitter_handle = submitter.start().map_err(RunError::OpenStorage)?; - - let tx_feed = L2TxFeed::new( - db_path.clone(), - shutdown.clone(), - L2TxFeedConfig { - batch_submitter_address: Some(l1_config.batch_submitter_address), - ..L2TxFeedConfig::default() - }, - ); - - let mut server_task = api::start( - &config.http_addr, - tx, - domain, - A::MAX_METHOD_PAYLOAD_BYTES, - shutdown.clone(), - tx_feed, - ApiConfig::default(), - ) - .await?; - - tracing::info!(address = %config.http_addr, "listening"); - - let shutdown_signal = tokio::signal::ctrl_c(); - tokio::pin!(shutdown_signal); - - let first_exit = tokio::select! { - signal_result = &mut shutdown_signal => { - FirstExit::Signal(signal_result.err().map(RunError::from)) - } - server_result = &mut server_task => { - FirstExit::Server(map_server_exit(server_result)) - } - lane_result = &mut inclusion_lane_handle => { - FirstExit::InclusionLane(map_lane_exit(lane_result)) - } - reader_result = &mut input_reader_handle => { - FirstExit::InputReader(map_input_reader_exit(reader_result)) - } - submitter_result = &mut batch_submitter_handle => { - FirstExit::BatchSubmitter(map_batch_submitter_exit(submitter_result)) - } - }; - - begin_runtime_shutdown(&shutdown); - finish_runtime( - first_exit, - server_task, - inclusion_lane_handle, - input_reader_handle, - batch_submitter_handle, - ) - .await -} - -fn begin_runtime_shutdown(shutdown: &ShutdownSignal) { - shutdown.request_shutdown(); -} - -async fn wait_for_clean_shutdown( - server_task: tokio::task::JoinHandle>, - inclusion_lane_handle: tokio::task::JoinHandle>, - input_reader_handle: tokio::task::JoinHandle>, - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - wait_for_server_shutdown(server_task).await?; - wait_for_lane_shutdown(inclusion_lane_handle).await?; - wait_for_input_reader_shutdown(input_reader_handle).await?; - wait_for_batch_submitter_shutdown(batch_submitter_handle).await?; - Ok(()) -} - -async fn finish_runtime( - first_exit: FirstExit, - server_task: tokio::task::JoinHandle>, - inclusion_lane_handle: tokio::task::JoinHandle>, - input_reader_handle: tokio::task::JoinHandle>, - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match first_exit { - FirstExit::Signal(signal_error) => { - let shutdown_result = wait_for_clean_shutdown( - server_task, - inclusion_lane_handle, - input_reader_handle, - batch_submitter_handle, - ) - .await; - match (signal_error, shutdown_result) { - (Some(err), _) => Err(err), - (None, Ok(())) => Ok(()), - (None, Err(err)) => Err(err), - } - } - FirstExit::Server(primary) => { - log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::InclusionLane(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::InputReader(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::BatchSubmitter(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - Err(primary) - } - } -} - -async fn wait_for_server_shutdown( - server_task: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match server_task.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::Io(source)), - Err(source) => Err(RunError::ServerJoin { source }), - } -} - -async fn wait_for_lane_shutdown( - inclusion_lane_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match inclusion_lane_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::InclusionLane { source }), - Err(source) => Err(RunError::InclusionLaneJoin { source }), - } -} - -async fn wait_for_input_reader_shutdown( - input_reader_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match input_reader_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::InputReader { source }), - Err(source) => Err(RunError::InputReaderJoin { source }), - } -} - -async fn wait_for_batch_submitter_shutdown( - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match batch_submitter_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::BatchSubmitter { source }), - Err(source) => Err(RunError::BatchSubmitterJoin { source }), - } -} - -fn map_server_exit(result: Result, tokio::task::JoinError>) -> RunError { - match result { - Ok(Ok(())) => RunError::ServerStoppedUnexpectedly, - Ok(Err(source)) => RunError::Io(source), - Err(source) => RunError::ServerJoin { source }, - } -} - -fn map_lane_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::InclusionLaneStoppedUnexpectedly, - Ok(Err(source)) => RunError::InclusionLane { source }, - Err(source) => RunError::InclusionLaneJoin { source }, - } -} - -fn map_input_reader_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::InputReaderStoppedUnexpectedly, - Ok(Err(source)) => RunError::InputReader { source }, - Err(source) => RunError::InputReaderJoin { source }, - } -} - -fn map_batch_submitter_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::BatchSubmitterStoppedUnexpectedly, - Ok(Err(source)) => RunError::BatchSubmitter { source }, - Err(source) => RunError::BatchSubmitterJoin { source }, - } -} - -fn log_cleanup_result(component: &str, result: Result<(), RunError>) { - if let Err(err) = result { - warn!(component, error = %err, "component shutdown after primary failure also errored"); - } -} - -fn build_batch_submitter_provider( - l1: &L1Config, -) -> Result { - crate::provider::create_signer_provider(&l1.eth_rpc_url, &l1.batch_submitter_private_key) - .map_err(std::io::Error::other) -} diff --git a/sequencer/src/runtime/clock.rs b/sequencer/src/runtime/clock.rs new file mode 100644 index 0000000..a0874e8 --- /dev/null +++ b/sequencer/src/runtime/clock.rs @@ -0,0 +1,19 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared clock helper. +//! +//! Every callsite that needs "now in Unix-ms" goes through [`unix_now_ms`] so +//! the sequencer has a single place to swap in a test clock if needed. +//! `SystemTime::now()` pre-epoch is defended against via `unwrap_or_default()`. + +use std::time::SystemTime; + +/// Current wall-clock time as Unix-ms. Passed into +/// [`crate::storage::Storage::check_danger`] and friends. +pub fn unix_now_ms() -> u64 { + SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 +} diff --git a/sequencer/src/config.rs b/sequencer/src/runtime/config.rs similarity index 54% rename from sequencer/src/config.rs rename to sequencer/src/runtime/config.rs index 1e4e1ad..ade3097 100644 --- a/sequencer/src/config.rs +++ b/sequencer/src/runtime/config.rs @@ -1,12 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) -use alloy_primitives::{Address, U256}; +use alloy_primitives::Address; use alloy_sol_types::Eip712Domain; use clap::{ArgGroup, Parser}; - -pub const DOMAIN_NAME: &str = "CartesiAppSequencer"; -pub const DOMAIN_VERSION: &str = "1"; +use sequencer_core::protocol::{ProtocolTiming, ProtocolTimingError}; const DEFAULT_HTTP_ADDR: &str = "127.0.0.1:3000"; const DEFAULT_DATA_DIR: &str = "sequencer-data"; @@ -61,7 +59,7 @@ pub struct RunConfig { #[arg(long, env = "SEQ_ETH_RPC_URL", value_parser = parse_non_empty_string)] pub eth_rpc_url: String, /// Error codes that trigger `get_logs` retries with a shorter block range. - #[arg(long, env = "SEQ_LONG_BLOCK_RANGE_ERROR_CODES", value_delimiter = ',', default_values = crate::partition::DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES)] + #[arg(long, env = "SEQ_LONG_BLOCK_RANGE_ERROR_CODES", value_delimiter = ',', default_values = crate::l1::partition::DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES)] pub long_block_range_error_codes: Vec, /// Expected chain ID. Validated against the RPC at startup. #[arg(long, env = "SEQ_CHAIN_ID")] @@ -92,24 +90,56 @@ pub struct RunConfig { )] pub batch_submitter_idle_poll_interval_ms: u64, - /// Number of blocks behind Latest that the batch submitter treats as confirmed. + /// Additional confirmations to wait for after a batch-submission tx is included on L1. #[arg( long, env = "SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH", - default_value = "0" + default_value = "2" )] pub batch_submitter_confirmation_depth: u64, + + /// Blocks before MAX_WAIT_BLOCKS to trigger preemptive recovery. + /// The danger threshold is MAX_WAIT_BLOCKS minus this margin. + /// Must be less than MAX_WAIT_BLOCKS (validated at startup). + /// + /// Default 300 (~1h at 12s/block) is sized to give operators meaningful + /// runway to investigate before the system gives up on the current + /// batches — see `docs/recovery/README.md` "Step 1: Danger threshold" + /// for the rationale. + #[arg(long, env = "SEQ_PREEMPTIVE_MARGIN_BLOCKS", default_value = "300")] + pub preemptive_margin_blocks: u64, + + /// Blocks of safe-head age after which the L1 read view is considered too + /// stale to trust. Independent of the preemptive margin — a separate + /// concern ("how old is the cached L1 view before we stop trusting it" vs. + /// "how much runway before write-side recovery trips"). Must be strictly + /// less than the danger threshold (validated at startup). + /// + /// Default 600 (~2h at 12s/block). + #[arg(long, env = "SEQ_L1_READ_STALE_AFTER_BLOCKS", default_value = "600", value_parser = clap::value_parser!(u64).range(1..))] + pub l1_read_stale_after_blocks: u64, + + /// Assumed L1 block time in seconds. Used to estimate block progression from + /// wall-clock time when the L1 provider is unreachable. + #[arg(long, env = "SEQ_SECONDS_PER_BLOCK", default_value = "12", value_parser = clap::value_parser!(u64).range(1..))] + pub seconds_per_block: u64, } impl RunConfig { pub fn build_domain(&self) -> Eip712Domain { - Eip712Domain { - name: Some(DOMAIN_NAME.into()), - version: Some(DOMAIN_VERSION.into()), - chain_id: Some(U256::from(self.chain_id)), - verifying_contract: Some(self.app_address), - salt: None, - } + sequencer_core::build_input_domain(self.chain_id, self.app_address) + } + + /// Build a validated [`ProtocolTiming`] from this config's tuning fields. + /// Pure derivation — does not touch I/O. `max_wait_blocks` is the shared + /// scheduler constant; the rest come from the operator-tunable CLI args. + pub fn protocol_timing(&self) -> Result { + ProtocolTiming::try_new( + sequencer_core::MAX_WAIT_BLOCKS, + self.preemptive_margin_blocks, + self.l1_read_stale_after_blocks, + self.seconds_per_block, + ) } /// Full path to the SQLite database file inside `data_dir`. @@ -157,9 +187,10 @@ fn parse_address(raw: &str) -> Result { #[cfg(test)] mod tests { - use super::{DOMAIN_NAME, DOMAIN_VERSION, RunConfig}; + use super::RunConfig; use alloy_primitives::{Address, U256}; use clap::Parser; + use sequencer_core::{DOMAIN_NAME, DOMAIN_VERSION}; const TEST_ARGS: [&str; 9] = [ "sequencer", @@ -203,6 +234,13 @@ mod tests { ); } + #[test] + fn run_config_defaults_batch_submitter_confirmation_depth_to_two() { + let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); + + assert_eq!(config.batch_submitter_confirmation_depth, 2); + } + #[test] fn run_config_builds_domain_with_fixed_name_and_version() { let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); @@ -216,4 +254,82 @@ mod tests { Some(Address::from_slice(&[0x11; 20])) ); } + + // ── H8 regression: SEQ_SECONDS_PER_BLOCK=0 is rejected by clap ── + // + // The H8 hardening added `value_parser = clap::value_parser!(u64).range(1..)` + // on `seconds_per_block` to prevent a divide-by-zero panic in the + // wall-clock fallback (`elapsed_secs / seconds_per_block`). Without the + // value parser, an operator typo would panic the process during the worst + // possible moment — an L1 outage. These tests lock the clap-level guard. + + fn args_with_seconds_per_block(value: &str) -> Vec<&str> { + let mut args: Vec<&str> = TEST_ARGS.to_vec(); + args.push("--seconds-per-block"); + args.push(value); + args + } + + fn args_with_l1_read_stale_after_blocks(value: &str) -> Vec<&str> { + let mut args: Vec<&str> = TEST_ARGS.to_vec(); + args.push("--l1-read-stale-after-blocks"); + args.push(value); + args + } + + #[test] + fn run_config_rejects_seconds_per_block_zero() { + let err = RunConfig::try_parse_from(args_with_seconds_per_block("0")) + .expect_err("seconds_per_block=0 must be rejected"); + let message = err.to_string(); + // The exact clap wording depends on the version; the specific field is + // what we want to pin. + assert!( + message.contains("--seconds-per-block") || message.contains("seconds_per_block"), + "error must name the offending field, got: {message}" + ); + } + + #[test] + fn run_config_accepts_seconds_per_block_one() { + // One is the minimum allowed (1..). + let config = + RunConfig::try_parse_from(args_with_seconds_per_block("1")).expect("parse succeeds"); + assert_eq!(config.seconds_per_block, 1); + } + + #[test] + fn run_config_default_seconds_per_block_is_12() { + let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); + assert_eq!( + config.seconds_per_block, 12, + "default should reflect Ethereum block time" + ); + } + + #[test] + fn run_config_rejects_l1_read_stale_after_blocks_zero() { + let err = RunConfig::try_parse_from(args_with_l1_read_stale_after_blocks("0")) + .expect_err("l1_read_stale_after_blocks=0 must be rejected"); + let message = err.to_string(); + assert!( + message.contains("--l1-read-stale-after-blocks") + || message.contains("l1_read_stale_after_blocks"), + "error must name the offending field, got: {message}" + ); + } + + #[test] + fn run_config_default_l1_read_stale_after_blocks_is_600() { + // Independent default (NOT derived from margin) — see field doc. + let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); + assert_eq!(config.l1_read_stale_after_blocks, 600); + } + + #[test] + fn run_config_accepts_l1_read_stale_after_blocks_one() { + let config = RunConfig::try_parse_from(args_with_l1_read_stale_after_blocks("1")) + .expect("parse succeeds"); + assert_eq!(config.l1_read_stale_after_blocks, 1); + } } diff --git a/sequencer/src/runtime/error.rs b/sequencer/src/runtime/error.rs new file mode 100644 index 0000000..c043949 --- /dev/null +++ b/sequencer/src/runtime/error.rs @@ -0,0 +1,283 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime error taxonomy. Three groupings: +//! +//! - [`BootstrapError`] / [`IdentityError`]: everything that can go wrong +//! before runtime workers come up — config validation, deployment-identity +//! guards, startup recovery, initial DB open. +//! - [`WorkerExit`] + per-worker `*Exit`: how each runtime worker exited. +//! - [`RunError`]: the top-level error returned by `run()`, with generic +//! [`std::io::Error`] / [`rusqlite::Error`] catch-alls that are used widely +//! enough not to nest. + +use thiserror::Error; + +use crate::ingress::inclusion_lane::InclusionLaneError; +use crate::l1::reader::InputReaderError; +use crate::l1::submitter::BatchSubmitterError; +use crate::recovery::{DangerDetectorError, RecoveryError}; +use crate::storage::{DangerStatus, DeploymentIdentity, StorageOpenError}; +use sequencer_core::protocol::ProtocolTimingError; + +// ── Top-level RunError ──────────────────────────────────────────────── + +/// Top-level runtime error. Grouped by phase: +/// +/// - `Bootstrap`: startup failures before runtime workers come up. +/// - `Worker`: one of the runtime workers exited (server, inclusion lane, +/// input reader, batch submitter, danger detector). +/// - `Io` / `Storage`: generic catch-alls used widely; not worth nesting. +#[derive(Debug, Error)] +pub enum RunError { + #[error("bootstrap failed: {0}")] + Bootstrap(#[from] BootstrapError), + #[error("worker exited: {0}")] + Worker(#[from] WorkerExit), + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("storage operation failed: {0}")] + Storage(#[from] rusqlite::Error), +} + +// ── Bootstrap-phase errors ───────────────────────────────────────────── + +/// Anything that can go wrong before runtime workers start: config validation, +/// deployment-identity guards, startup recovery, initial DB open. +#[derive(Debug, Error)] +pub enum BootstrapError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error("RPC chain ID {rpc} does not match --chain-id {config}")] + ChainIdMismatch { rpc: u64, config: u64 }, + /// `eth_chainId` failed on a reachable RPC. We treat this as fatal + /// rather than warn-and-continue: proceeding with an unverified chain id + /// would pin a possibly-wrong deployment identity and poison subsequent + /// L1-unreachable boots, in addition to issuing soft confirmations + /// against the wrong chain's state. Operator should retry. + #[error("could not query chain ID from RPC: {message}")] + ChainIdRpc { message: String }, + /// Protocol-level config (`preemptive_margin_blocks` vs `max_wait_blocks`, + /// `l1_read_stale_after_blocks` vs `danger_threshold`) failed validation. + /// See [`ProtocolTimingError`]. + #[error(transparent)] + InvalidProtocolTiming(#[from] ProtocolTimingError), + /// Startup recovery (or refusal) failed before runtime workers started. + #[error(transparent)] + Recovery(#[from] RecoveryError), + /// Deployment-identity guards — see [`IdentityError`]. + #[error(transparent)] + Identity(#[from] IdentityError), +} + +/// Deployment-identity failure modes. The sequencer pins itself to a specific +/// (chain_id, app_address, input_box_address, input_box_genesis_block, +/// batch_submitter_address) tuple on first successful boot, then refuses to +/// run under a different identity to prevent silently associating state from +/// one deployment with another. +#[derive(Debug, Error)] +pub enum IdentityError { + /// L1 unreachable AND no cached identity in the DB. We need at least one + /// (live L1 query OR a prior boot's pinned identity) to safely bind this + /// sequencer to a deployment. Operator: bring up L1 and retry. + #[error("first boot requires L1: no cached deployment identity and L1 is unreachable")] + FirstBootRequiresL1, + /// The DB has persisted state but no pinned identity. Binding the current + /// config now would silently inherit an unknown deployment's data. + /// Operator: confirm provenance or wipe the DB. + #[error("orphaned state: DB has persisted state but no deployment identity to claim it")] + OrphanedState, + /// The pinned identity doesn't match the current config. + /// + /// `stored` and `expected` are boxed so the enum stays small — without + /// boxing this variant alone would push `RunError`'s stack footprint past + /// 184 bytes, which clippy's `result_large_err` flags (and which inflates + /// every `Result<_, RunError>` in the codebase, even successful returns). + /// The heap allocation is paid only on the error path, which is cold. + #[error("deployment identity mismatch ({fields}); stored={stored:?}; expected={expected:?}")] + Mismatch { + fields: String, + stored: Box, + expected: Box, + }, +} + +// ── Worker exits ─────────────────────────────────────────────────────── + +/// Which runtime worker exited, and why. +#[derive(Debug, Error)] +pub enum WorkerExit { + #[error("server: {0}")] + Server(#[from] ServerExit), + #[error("inclusion lane: {0}")] + Lane(#[from] LaneExit), + #[error("input reader: {0}")] + InputReader(#[from] InputReaderExit), + #[error("batch submitter: {0}")] + BatchSubmitter(#[from] BatchSubmitterExit), + #[error("danger detector: {0}")] + DangerDetector(#[from] DangerDetectorExit), +} + +/// Generic worker exit shape: stopped without signal / errored / failed to join. +#[derive(Debug, Error)] +pub enum ServerExit { + #[error("stopped unexpectedly")] + StoppedUnexpectedly, + #[error("io error: {0}")] + Source(std::io::Error), + #[error("join error: {0}")] + Join(tokio::task::JoinError), +} + +#[derive(Debug, Error)] +pub enum LaneExit { + #[error("stopped unexpectedly")] + StoppedUnexpectedly, + #[error("{0}")] + Source(InclusionLaneError), + #[error("join error: {0}")] + Join(tokio::task::JoinError), +} + +#[derive(Debug, Error)] +pub enum InputReaderExit { + #[error("stopped unexpectedly")] + StoppedUnexpectedly, + #[error("{0}")] + Source(InputReaderError), + #[error("join error: {0}")] + Join(tokio::task::JoinError), +} + +#[derive(Debug, Error)] +pub enum BatchSubmitterExit { + #[error("stopped unexpectedly")] + StoppedUnexpectedly, + #[error("{0}")] + Source(BatchSubmitterError), + #[error("join error: {0}")] + Join(tokio::task::JoinError), +} + +/// Detector has an extra variant for the deliberate `RecoveryRequired` trip: +/// not an error per se, but causes the runtime to exit so the orchestrator +/// can respawn into startup recovery. +#[derive(Debug, Error)] +pub enum DangerDetectorExit { + #[error("stopped unexpectedly")] + StoppedUnexpectedly, + #[error("{0}")] + Source(DangerDetectorError), + #[error("join error: {0}")] + Join(tokio::task::JoinError), + #[error("danger detected ({status:?}) — stopping for startup recovery")] + DangerDetected { status: DangerStatus }, +} + +// ── Shutdown-time constructors ──────────────────────────────────────── +// +// Used during orderly shutdown (runtime-wide shutdown was already +// requested). `Ok(())` is the expected "drained cleanly" outcome and +// returns `Ok(())`; everything else surfaces as the matching error variant. +// Distinct from the select-arm `From` impls, where `Ok(())` means the worker +// stopped *before* shutdown was triggered (`StoppedUnexpectedly`). + +impl ServerExit { + pub fn from_shutdown( + result: Result, tokio::task::JoinError>, + ) -> Result<(), Self> { + match result { + Ok(Ok(())) => Ok(()), + Ok(Err(source)) => Err(Self::Source(source)), + Err(source) => Err(Self::Join(source)), + } + } +} + +impl LaneExit { + pub fn from_shutdown( + result: Result, tokio::task::JoinError>, + ) -> Result<(), Self> { + match result { + Ok(Ok(())) => Ok(()), + Ok(Err(source)) => Err(Self::Source(source)), + Err(source) => Err(Self::Join(source)), + } + } +} + +impl InputReaderExit { + pub fn from_shutdown( + result: Result, tokio::task::JoinError>, + ) -> Result<(), Self> { + match result { + Ok(Ok(())) => Ok(()), + Ok(Err(source)) => Err(Self::Source(source)), + Err(source) => Err(Self::Join(source)), + } + } +} + +impl BatchSubmitterExit { + pub fn from_shutdown( + result: Result< + Result, + tokio::task::JoinError, + >, + ) -> Result<(), Self> { + match result { + Ok(Ok(crate::l1::submitter::SubmitterExit::Shutdown)) => Ok(()), + Ok(Err(source)) => Err(Self::Source(source)), + Err(source) => Err(Self::Join(source)), + } + } +} + +impl DangerDetectorExit { + pub fn from_shutdown( + result: Result< + Result, + tokio::task::JoinError, + >, + ) -> Result<(), Self> { + match result { + Ok(Ok(crate::recovery::DetectorExit::Shutdown)) => Ok(()), + Ok(Ok(crate::recovery::DetectorExit::RecoveryRequired { status })) => { + Err(Self::DangerDetected { status }) + } + Ok(Err(source)) => Err(Self::Source(source)), + Err(source) => Err(Self::Join(source)), + } + } +} + +// ── Chained `From` impls so `?` works at the top-level RunError ──────── +// +// thiserror's `#[from]` is one-level; nested propagation needs manual +// impls. Each leaf error type that can bubble up through `?` in `run()` +// gets a direct From for RunError. + +impl From for RunError { + fn from(e: StorageOpenError) -> Self { + RunError::Bootstrap(e.into()) + } +} + +impl From for RunError { + fn from(e: ProtocolTimingError) -> Self { + RunError::Bootstrap(e.into()) + } +} + +impl From for RunError { + fn from(e: RecoveryError) -> Self { + RunError::Bootstrap(e.into()) + } +} + +impl From for RunError { + fn from(e: IdentityError) -> Self { + RunError::Bootstrap(e.into()) + } +} diff --git a/sequencer/src/runtime/mod.rs b/sequencer/src/runtime/mod.rs new file mode 100644 index 0000000..3412e85 --- /dev/null +++ b/sequencer/src/runtime/mod.rs @@ -0,0 +1,405 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Process orchestration. Three phases: +//! +//! 1. **Bootstrap**: parse config, validate identity, build the L1 config. +//! 2. **Preemptive recovery**: run the startup recovery procedure +//! ([`crate::recovery::run_preemptive_recovery`]). +//! 3. **Workers**: hand off to [`workers::Workers`] for spawn → select → +//! finish. +//! +//! Errors live in [`error`]; worker lifecycle in [`workers`]. + +pub mod clock; +pub mod config; +pub mod error; +pub mod shutdown; +mod workers; + +use std::time::Duration; + +use crate::l1::reader::{InputReader, InputReaderConfig, InputReaderError}; +use crate::storage::{self, DeploymentIdentity}; +use alloy_primitives::Address; +use config::{L1Config, RunConfig}; +use sequencer_core::application::Application; +use sequencer_core::protocol::ProtocolTiming; + +pub use error::{ + BatchSubmitterExit, BootstrapError, DangerDetectorExit, IdentityError, InputReaderExit, + LaneExit, RunError, ServerExit, WorkerExit, +}; + +use workers::{Workers, WorkersConfig}; + +const INPUT_READER_POLL_INTERVAL: Duration = Duration::from_secs(2); + +pub async fn run(app: A, config: RunConfig) -> Result<(), RunError> +where + A: Application + 'static, +{ + // ── Bootstrap ──────────────────────────────────────────── + std::fs::create_dir_all(&config.data_dir)?; + let db_path = config.db_path(); + let key = config.resolve_private_key()?; + let batch_submitter_address = batch_submitter_address_from_private_key(&key)?; + + // One ProtocolTiming shared across the whole process. `try_new` validates + // margin/stale relationships up front — including before the startup log + // below — so a bad config produces a clean typed error instead of + // panicking mid-log. + let timing = config.protocol_timing()?; + + let (mut input_reader, l1_config) = bootstrap_l1_config( + &config, + db_path.as_str(), + timing, + batch_submitter_address, + key, + ) + .await?; + + tracing::info!( + http_addr = %config.http_addr, + data_dir = %config.data_dir, + eth_rpc_url = %l1_config.eth_rpc_url, + input_box_address = %l1_config.input_box_address, + input_reader_genesis_block = input_reader.genesis_block(), + chain_id = config.chain_id, + app_address = %l1_config.app_address, + batch_submitter_address = %l1_config.batch_submitter_address, + max_wait_blocks = timing.max_wait_blocks, + preemptive_margin_blocks = timing.preemptive_margin_blocks, + danger_threshold = timing.danger_threshold(), + "sequencer startup" + ); + + // ── Preemptive recovery ────────────────────────────────── + // See docs/recovery/ for the full design and TLA+ spec. + crate::recovery::run_preemptive_recovery(&db_path, &mut input_reader, &l1_config, &timing) + .await?; + + // ── Workers ────────────────────────────────────────────── + let mut workers = Workers::spawn(WorkersConfig { + app, + run_config: config, + l1_config, + timing, + input_reader, + }) + .await?; + + let first_exit = workers.select_first_exit().await; + workers.finish(first_exit).await +} + +// ── Bootstrap helpers ────────────────────────────────────────────────── + +fn batch_submitter_address_from_private_key(private_key: &str) -> Result { + use alloy::signers::local::PrivateKeySigner; + use std::str::FromStr; + + Ok(PrivateKeySigner::from_str(private_key) + .map_err(|_| RunError::Io(std::io::Error::other("invalid private key")))? + .address()) +} + +/// Resolve `(InputReader, L1Config)` from the configured RPC, falling back to +/// the DB-pinned deployment identity when L1 is unreachable. +/// +/// On first startup, L1 is required (no cached identity to fall back on). On +/// subsequent startups, the identity allows the sequencer to start without L1 +/// while refusing to interpret the DB under a different deployment. +/// +/// The genesis block is available via `input_reader.genesis_block()` on the +/// returned reader. +async fn bootstrap_l1_config( + config: &RunConfig, + db_path: &str, + timing: ProtocolTiming, + batch_submitter_address: Address, + batch_submitter_private_key: String, +) -> Result<(InputReader, L1Config), RunError> { + let input_reader_config = InputReaderConfig { + rpc_url: config.eth_rpc_url.clone(), + app_address: config.app_address, + poll_interval: INPUT_READER_POLL_INTERVAL, + long_block_range_error_codes: config.long_block_range_error_codes.clone(), + }; + + let (input_reader, input_box_address) = match InputReader::new( + db_path.to_owned(), + input_reader_config.clone(), + batch_submitter_address, + timing, + ) + .await + { + Ok(reader) => { + let input_box = reader.input_box_address(); + + // Validate chain ID early — before any DB identity writes. + validate_rpc_chain_id(&config.eth_rpc_url, config.chain_id).await?; + + let expected_identity = DeploymentIdentity { + chain_id: config.chain_id, + app_address: config.app_address, + input_box_address: input_box, + input_box_genesis_block: reader.genesis_block(), + batch_submitter_address, + }; + ensure_deployment_identity(db_path, expected_identity)?; + + (reader, input_box) + } + Err(InputReaderError::Provider(e)) => { + tracing::error!( + error = %e, + "L1 unreachable during bootstrap — checking deployment identity" + ); + let cached = cached_deployment_identity( + db_path, + config.chain_id, + config.app_address, + batch_submitter_address, + )?; + let reader = InputReader::from_parts( + input_reader_config, + cached.input_box_address, + cached.input_box_genesis_block, + db_path.to_owned(), + batch_submitter_address, + timing, + ); + (reader, cached.input_box_address) + } + Err(source) => { + // L1 reachable but `InputReader::new` failed for a non-provider + // reason — wrap as a startup-time worker source error. + return Err(RunError::Worker(WorkerExit::InputReader( + InputReaderExit::Source(source), + ))); + } + }; + + let l1_config = L1Config { + eth_rpc_url: config.eth_rpc_url.clone(), + input_box_address, + app_address: config.app_address, + batch_submitter_private_key, + batch_submitter_address, + }; + Ok((input_reader, l1_config)) +} + +/// Verify that the RPC's `eth_chainId` matches the configured chain id. +/// +/// Treated as fatal on mismatch *and* on RPC error: pinning a wrong or +/// unverified chain id into storage would poison subsequent L1-unreachable +/// boots and issue soft confirmations against the wrong chain. Caller is +/// expected to retry on `ChainIdRpc`. +async fn validate_rpc_chain_id(eth_rpc_url: &str, expected: u64) -> Result<(), RunError> { + use alloy::providers::Provider; + let check_provider = crate::l1::provider::create_provider(eth_rpc_url) + .map_err(|e| RunError::Io(std::io::Error::other(e)))?; + match check_provider.get_chain_id().await { + Ok(rpc_chain_id) if rpc_chain_id != expected => { + Err(RunError::Bootstrap(BootstrapError::ChainIdMismatch { + rpc: rpc_chain_id, + config: expected, + })) + } + Ok(_) => Ok(()), + Err(e) => Err(RunError::Bootstrap(BootstrapError::ChainIdRpc { + message: e.to_string(), + })), + } +} + +fn ensure_deployment_identity(db_path: &str, expected: DeploymentIdentity) -> Result<(), RunError> { + let mut storage = storage::Storage::open(db_path)?; + if let Some(stored) = storage.deployment_identity()? { + return require_deployment_identity_match(stored, expected); + } + if storage.has_persisted_deployment_state()? { + return Err(IdentityError::OrphanedState.into()); + } + let stored = storage.load_or_insert_deployment_identity(expected)?; + require_deployment_identity_match(stored, expected) +} + +fn cached_deployment_identity( + db_path: &str, + chain_id: u64, + app_address: Address, + batch_submitter_address: Address, +) -> Result { + let storage = storage::Storage::open(db_path)?; + let Some(stored) = storage.deployment_identity()? else { + return Err(IdentityError::FirstBootRequiresL1.into()); + }; + let expected = DeploymentIdentity { + chain_id, + app_address, + input_box_address: stored.input_box_address, + input_box_genesis_block: stored.input_box_genesis_block, + batch_submitter_address, + }; + require_deployment_identity_match(stored, expected)?; + Ok(stored) +} + +fn require_deployment_identity_match( + stored: DeploymentIdentity, + expected: DeploymentIdentity, +) -> Result<(), RunError> { + let fields = deployment_identity_mismatch_fields(stored, expected); + if fields.is_empty() { + return Ok(()); + } + Err(IdentityError::Mismatch { + fields: fields.join(", "), + stored: Box::new(stored), + expected: Box::new(expected), + } + .into()) +} + +fn deployment_identity_mismatch_fields( + stored: DeploymentIdentity, + expected: DeploymentIdentity, +) -> Vec<&'static str> { + let mut fields = Vec::new(); + if stored.chain_id != expected.chain_id { + fields.push("chain_id"); + } + if stored.app_address != expected.app_address { + fields.push("app_address"); + } + if stored.input_box_address != expected.input_box_address { + fields.push("input_box_address"); + } + if stored.input_box_genesis_block != expected.input_box_genesis_block { + fields.push("input_box_genesis_block"); + } + if stored.batch_submitter_address != expected.batch_submitter_address { + fields.push("batch_submitter_address"); + } + fields +} + +#[cfg(test)] +mod tests { + use super::{ + BootstrapError, IdentityError, RunError, batch_submitter_address_from_private_key, + deployment_identity_mismatch_fields, ensure_deployment_identity, + require_deployment_identity_match, + }; + use crate::recovery::{RecoveryError, RefuseReason}; + use crate::storage::test_helpers::{SENDER_A, default_protocol_timing, temp_db}; + use crate::storage::{DeploymentIdentity, Storage}; + use alloy_primitives::Address; + use sequencer_core::protocol::ProtocolTimingError; + + // Margin/stale-boundary validation is exercised directly in + // `sequencer-core/src/protocol.rs`. The runtime tests below only cover + // the typed `From` conversions into `RunError` and the bootstrap-time + // identity guards. Worker `From` conversions live in + // `runtime/workers.rs`. + + #[test] + fn invalid_protocol_config_propagates_through_run_error() { + let err: RunError = ProtocolTimingError::MarginNotLessThanMaxWait { + margin: 1200, + max_wait: 1200, + } + .into(); + assert!(matches!( + err, + RunError::Bootstrap(BootstrapError::InvalidProtocolTiming(_)) + )); + } + + #[test] + fn startup_recovery_error_preserves_recovery_category() { + let err: RunError = RecoveryError::Refuse(RefuseReason::L1ViewStale).into(); + assert!(matches!( + err, + RunError::Bootstrap(BootstrapError::Recovery(RecoveryError::Refuse( + RefuseReason::L1ViewStale + ))) + )); + } + + fn identity() -> DeploymentIdentity { + DeploymentIdentity { + chain_id: 31337, + app_address: Address::repeat_byte(0x11), + input_box_address: Address::repeat_byte(0x22), + input_box_genesis_block: 42, + batch_submitter_address: Address::repeat_byte(0x33), + } + } + + #[test] + fn deployment_identity_match_accepts_same_identity() { + let identity = identity(); + require_deployment_identity_match(identity, identity).expect("same identity should match"); + } + + #[test] + fn deployment_identity_mismatch_reports_changed_fields() { + let stored = identity(); + let expected = DeploymentIdentity { + chain_id: 31338, + app_address: Address::repeat_byte(0x44), + batch_submitter_address: Address::repeat_byte(0x55), + ..stored + }; + + assert_eq!( + deployment_identity_mismatch_fields(stored, expected), + vec!["chain_id", "app_address", "batch_submitter_address"] + ); + let err = require_deployment_identity_match(stored, expected) + .expect_err("mismatch should refuse startup"); + assert!(matches!( + err, + RunError::Bootstrap(BootstrapError::Identity(IdentityError::Mismatch { fields, .. })) + if fields == "chain_id, app_address, batch_submitter_address" + )); + } + + #[test] + fn deployment_identity_refuses_non_empty_unpinned_db() { + let db = temp_db("runtime-unpinned-deployment-state"); + { + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .append_safe_inputs(0, &[], SENDER_A, &default_protocol_timing()) + .expect("seed deployment-bound state"); + } + + let err = ensure_deployment_identity(db.path.as_str(), identity()) + .expect_err("non-empty unpinned DB must refuse"); + assert!(matches!( + err, + RunError::Bootstrap(BootstrapError::Identity(IdentityError::OrphanedState)) + )); + } + + #[test] + fn invalid_private_key_error_does_not_echo_key_material() { + let secret = "0xabc123SECRET"; + let err = batch_submitter_address_from_private_key(secret) + .expect_err("invalid private key should be rejected"); + let message = err.to_string(); + + assert_eq!(message, "invalid private key"); + assert!( + !message.contains(secret), + "private key material must not be reflected in startup errors" + ); + } +} diff --git a/sequencer/src/shutdown.rs b/sequencer/src/runtime/shutdown.rs similarity index 100% rename from sequencer/src/shutdown.rs rename to sequencer/src/runtime/shutdown.rs diff --git a/sequencer/src/runtime/workers.rs b/sequencer/src/runtime/workers.rs new file mode 100644 index 0000000..a1b0b1a --- /dev/null +++ b/sequencer/src/runtime/workers.rs @@ -0,0 +1,456 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime worker lifecycle: spawn → run-until-first-exit → orderly cleanup. +//! +//! [`Workers`] owns the five runtime worker handles (server, lane, input +//! reader, batch submitter, danger detector) plus the shared shutdown signal. +//! Three methods describe its lifecycle: +//! +//! - [`Workers::spawn`]: build all configs, spawn workers, return owning struct. +//! - [`Workers::select_first_exit`]: race the workers + OS shutdown signal, +//! return whichever fired first. +//! - [`Workers::finish`]: request shutdown, await each component (logging +//! cleanup-time errors), surface the primary failure. +//! +//! Worker plumbing is intentionally explicit per-worker (5 fields, 5 spawn +//! statements, 5 select arms, 5 cleanup entries). Adding a sixth worker means +//! editing each of those four sites — but each edit is obvious and local. + +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use alloy::providers::DynProvider; +use tokio::task::JoinHandle; +use tracing::warn; + +use crate::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use crate::http::{self, ApiConfig}; +use crate::ingress::inclusion_lane::{InclusionLane, InclusionLaneConfig, InclusionLaneError}; +use crate::l1::reader::{InputReader, InputReaderError}; +use crate::l1::submitter::{ + BatchPosterConfig, BatchSubmitter, BatchSubmitterConfig, BatchSubmitterError, + EthereumBatchPoster, SubmitterExit, +}; +use crate::recovery::{DangerDetector, DangerDetectorError, DetectorExit}; +use crate::runtime::config::{L1Config, RunConfig}; +use crate::runtime::error::{ + BatchSubmitterExit, DangerDetectorExit, InputReaderExit, LaneExit, RunError, ServerExit, + WorkerExit, +}; +use crate::runtime::shutdown::ShutdownSignal; +use sequencer_core::application::Application; +use sequencer_core::protocol::ProtocolTiming; + +const QUEUE_CAPACITY: usize = 8192; +/// Danger detector cadence. Cheap DB-only check; re-running quickly bounds the +/// lag on entering the danger zone. The preemptive margin absorbs bounded lag. +const DANGER_DETECTOR_POLL_INTERVAL: Duration = Duration::from_secs(2); + +/// Which event ended the `select!` race in [`Workers::select_first_exit`]. +pub(crate) enum FirstExit { + Signal(Option), + Worker(WorkerExit), +} + +/// Inputs to [`Workers::spawn`]. Consumed entirely; the caller has nothing +/// further to do with these after the call. +/// +/// Pure derivations (`db_path`, `domain`, `input_reader.genesis_block()`) are +/// computed inside `spawn` rather than threaded through here. +pub(crate) struct WorkersConfig { + pub app: A, + pub run_config: RunConfig, + pub l1_config: L1Config, + pub timing: ProtocolTiming, + pub input_reader: InputReader, +} + +/// Owns the five worker handles + the shutdown signal that drives all of them. +/// Construction (`spawn`) and teardown (`finish`) bracket the worker +/// lifecycle. +pub(crate) struct Workers { + server: JoinHandle>, + lane: JoinHandle>, + reader: JoinHandle>, + submitter: JoinHandle>, + detector: JoinHandle>, + shutdown: ShutdownSignal, +} + +impl Workers { + /// Build the worker configs, spawn each worker, return the owning struct. + /// Logs `listening` once the HTTP server is bound. + pub(crate) async fn spawn( + cfg: WorkersConfig, + ) -> Result { + let WorkersConfig { + app, + run_config, + l1_config, + timing, + input_reader, + } = cfg; + + // Derived values — kept inside `spawn` so `WorkersConfig` stays + // minimal and these aren't computed twice in the caller. + let db_path = run_config.db_path(); + let domain = run_config.build_domain(); + let input_reader_genesis_block = input_reader.genesis_block(); + + let shutdown = ShutdownSignal::default(); + + // Inclusion lane: takes the app, returns the tx-sender the HTTP + // ingress route will publish to. + let storage = crate::storage::Storage::open(&db_path)?; + let (tx, lane) = InclusionLane::start( + QUEUE_CAPACITY, + shutdown.clone(), + app, + storage, + InclusionLaneConfig::new(l1_config.batch_submitter_address), + ); + + // Input reader: produces safe-input rows from L1. + let reader = input_reader.start(shutdown.clone())?; + + // Batch submitter: posts closed batches to L1. + let poster_config = BatchPosterConfig { + l1_submit_address: l1_config.input_box_address, + app_address: l1_config.app_address, + batch_submitter_address: l1_config.batch_submitter_address, + start_block: input_reader_genesis_block, + confirmation_depth: run_config.batch_submitter_confirmation_depth, + seconds_per_block: run_config.seconds_per_block, + long_block_range_error_codes: run_config.long_block_range_error_codes.clone(), + }; + let provider = build_batch_submitter_provider(&l1_config)?; + let poster = Arc::new(EthereumBatchPoster::new(provider, poster_config)); + let submitter_config = BatchSubmitterConfig { + idle_poll_interval_ms: run_config.batch_submitter_idle_poll_interval_ms, + }; + let submitter = BatchSubmitter::new(db_path.clone(), poster, submitter_config) + .start(shutdown.clone())?; + + // Danger detector: trips startup recovery on bad DB/L1 state. + let detector = DangerDetector::new(db_path.clone(), timing, DANGER_DETECTOR_POLL_INTERVAL) + .start(shutdown.clone())?; + + // HTTP server (ingress /tx + egress /ws/subscribe + /health, currently merged). + let tx_feed = L2TxFeed::new( + db_path.clone(), + shutdown.clone(), + L2TxFeedConfig { + batch_submitter_address: Some(l1_config.batch_submitter_address), + ..L2TxFeedConfig::default() + }, + ); + let server = http::start( + &run_config.http_addr, + tx, + domain, + A::MAX_METHOD_PAYLOAD_BYTES, + shutdown.clone(), + tx_feed, + ApiConfig::default(), + ) + .await?; + tracing::info!(address = %run_config.http_addr, "listening"); + + Ok(Self { + server, + lane, + reader, + submitter, + detector, + shutdown, + }) + } + + /// Race ctrl_c against each worker's join handle. The first to complete + /// produces the [`FirstExit`]. + pub(crate) async fn select_first_exit(&mut self) -> FirstExit { + let shutdown_signal = tokio::signal::ctrl_c(); + tokio::pin!(shutdown_signal); + tokio::select! { + signal_result = &mut shutdown_signal => signal_result.into(), + server_result = &mut self.server => server_result.into(), + lane_result = &mut self.lane => lane_result.into(), + reader_result = &mut self.reader => reader_result.into(), + submitter_result = &mut self.submitter => submitter_result.into(), + detector_result = &mut self.detector => detector_result.into(), + } + } + + /// Drive orderly cleanup: request shutdown, await each worker (logging + /// cleanup-time errors), surface the primary failure (or the signal- + /// handler error, which takes priority over component errors observed + /// during shutdown). + pub(crate) async fn finish(self, first_exit: FirstExit) -> Result<(), RunError> { + self.shutdown.request_shutdown(); + + let Self { + server, + lane, + reader, + submitter, + detector, + shutdown: _, + } = self; + let components: [(&'static str, ComponentShutdown); 5] = [ + ("server", Box::pin(wait_for_server_shutdown(server))), + ("inclusion lane", Box::pin(wait_for_lane_shutdown(lane))), + ( + "input reader", + Box::pin(wait_for_input_reader_shutdown(reader)), + ), + ( + "batch submitter", + Box::pin(wait_for_batch_submitter_shutdown(submitter)), + ), + ( + "danger detector", + Box::pin(wait_for_danger_detector_shutdown(detector)), + ), + ]; + + // Two completion modes: + // - Worker-failure: we already have the primary; await the OTHER + // components for orderly cleanup, log any cleanup errors, surface + // the primary (wrapped to RunError). + // - Signal-driven shutdown: an OS signal triggered shutdown. Wait for + // everything to drain; the signal handler's own error (if any) + // takes priority over any subsequent component shutdown error. + let (worker_failure, signal_error): (Option<(&'static str, WorkerExit)>, Option) = + match first_exit { + FirstExit::Signal(err) => (None, err), + FirstExit::Worker(exit) => { + let name = exit.component_name(); + (Some((name, exit)), None) + } + }; + + if let Some((failed, primary_exit)) = worker_failure { + for (name, fut) in components { + if name == failed { + // Drop the primary's future without awaiting — its task + // is already done (it's what tripped the select), and + // we'll surface its error directly below. + drop(fut); + continue; + } + log_cleanup_result(name, fut.await); + } + return Err(RunError::Worker(primary_exit)); + } + + // Signal path: short-circuit on first shutdown error. + let mut shutdown_error: Option = None; + for (_, fut) in components { + if let Err(e) = fut.await { + shutdown_error = Some(e); + break; + } + } + match (signal_error, shutdown_error) { + (Some(err), _) => Err(err), + (None, Some(exit)) => Err(RunError::Worker(exit)), + (None, None) => Ok(()), + } + } +} + +impl WorkerExit { + /// Human-readable component label, matching the names used in the + /// `Workers::finish` component list. + fn component_name(&self) -> &'static str { + match self { + WorkerExit::Server(_) => "server", + WorkerExit::Lane(_) => "inclusion lane", + WorkerExit::InputReader(_) => "input reader", + WorkerExit::BatchSubmitter(_) => "batch submitter", + WorkerExit::DangerDetector(_) => "danger detector", + } + } +} + +// ── `From` for FirstExit ────────────────────────────────── +// +// Each `select!` arm awaits a future and converts the result into a +// `FirstExit`. We dispatch via these `From` impls so the select arms read as +// uniform one-liners (`result.into()`); the worker-specific mapping logic +// lives here, with each input type uniquely identifying its worker. + +/// ctrl_c shutdown signal: `Ok(())` = clean signal, `Err(io)` = signal-handler +/// installation failed. +impl From> for FirstExit { + fn from(result: Result<(), std::io::Error>) -> Self { + FirstExit::Signal(result.err().map(RunError::from)) + } +} + +impl From, tokio::task::JoinError>> for FirstExit { + fn from(result: Result, tokio::task::JoinError>) -> Self { + FirstExit::Worker(WorkerExit::Server(match result { + Ok(Ok(())) => ServerExit::StoppedUnexpectedly, + Ok(Err(source)) => ServerExit::Source(source), + Err(source) => ServerExit::Join(source), + })) + } +} + +impl From, tokio::task::JoinError>> for FirstExit { + fn from(result: Result, tokio::task::JoinError>) -> Self { + FirstExit::Worker(WorkerExit::Lane(match result { + Ok(Ok(())) => LaneExit::StoppedUnexpectedly, + Ok(Err(source)) => LaneExit::Source(source), + Err(source) => LaneExit::Join(source), + })) + } +} + +impl From, tokio::task::JoinError>> for FirstExit { + fn from(result: Result, tokio::task::JoinError>) -> Self { + FirstExit::Worker(WorkerExit::InputReader(match result { + Ok(Ok(())) => InputReaderExit::StoppedUnexpectedly, + Ok(Err(source)) => InputReaderExit::Source(source), + Err(source) => InputReaderExit::Join(source), + })) + } +} + +impl From, tokio::task::JoinError>> + for FirstExit +{ + fn from( + result: Result, tokio::task::JoinError>, + ) -> Self { + FirstExit::Worker(WorkerExit::BatchSubmitter(match result { + // Worker returning `Shutdown` outside of a real shutdown means it + // stopped on its own — treat as unexpected. + Ok(Ok(SubmitterExit::Shutdown)) => BatchSubmitterExit::StoppedUnexpectedly, + Ok(Err(source)) => BatchSubmitterExit::Source(source), + Err(source) => BatchSubmitterExit::Join(source), + })) + } +} + +impl From, tokio::task::JoinError>> for FirstExit { + fn from( + result: Result, tokio::task::JoinError>, + ) -> Self { + FirstExit::Worker(WorkerExit::DangerDetector(match result { + // Detector Shutdown means its own shutdown signal fired, which + // only happens after runtime-wide shutdown was triggered. Treat + // as unexpected if it wins the select. + Ok(Ok(DetectorExit::Shutdown)) => DangerDetectorExit::StoppedUnexpectedly, + Ok(Ok(DetectorExit::RecoveryRequired { status })) => { + DangerDetectorExit::DangerDetected { status } + } + Ok(Err(source)) => DangerDetectorExit::Source(source), + Err(source) => DangerDetectorExit::Join(source), + })) + } +} + +// ── Shutdown waiters ─────────────────────────────────────────────────── +// +// Each waiter awaits a worker's JoinHandle and converts via the per-worker +// `*Exit::from_shutdown` constructor (which knows `Ok(())` is graceful). +// Same shape per worker; kept explicit for readability. + +type ComponentShutdown = Pin> + Send>>; + +async fn wait_for_server_shutdown( + server_task: JoinHandle>, +) -> Result<(), WorkerExit> { + ServerExit::from_shutdown(server_task.await).map_err(Into::into) +} + +async fn wait_for_lane_shutdown( + handle: JoinHandle>, +) -> Result<(), WorkerExit> { + LaneExit::from_shutdown(handle.await).map_err(Into::into) +} + +async fn wait_for_input_reader_shutdown( + handle: JoinHandle>, +) -> Result<(), WorkerExit> { + InputReaderExit::from_shutdown(handle.await).map_err(Into::into) +} + +async fn wait_for_batch_submitter_shutdown( + handle: JoinHandle>, +) -> Result<(), WorkerExit> { + BatchSubmitterExit::from_shutdown(handle.await).map_err(Into::into) +} + +async fn wait_for_danger_detector_shutdown( + handle: JoinHandle>, +) -> Result<(), WorkerExit> { + DangerDetectorExit::from_shutdown(handle.await).map_err(Into::into) +} + +fn log_cleanup_result(component: &str, result: Result<(), WorkerExit>) { + if let Err(err) = result { + warn!(component, error = %err, "component shutdown after primary failure also errored"); + } +} + +fn build_batch_submitter_provider(l1: &L1Config) -> Result { + crate::l1::provider::create_signer_provider(&l1.eth_rpc_url, &l1.batch_submitter_private_key) + .map_err(std::io::Error::other) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::recovery::DangerDetectorError; + use crate::storage::DangerStatus; + + // ── select!-arm `From` conversions ────────────────── + // + // The detector arm is the interesting one (DangerDetected vs Shutdown vs + // Source vs Join). The other workers follow a uniform 3-way mapping + // covered by the type system. + + type DetectorJoinResult = + Result, tokio::task::JoinError>; + + #[test] + fn detector_shutdown_in_select_maps_to_stopped_unexpectedly() { + let result: DetectorJoinResult = Ok(Ok(DetectorExit::Shutdown)); + assert!(matches!( + FirstExit::from(result), + FirstExit::Worker(WorkerExit::DangerDetector( + DangerDetectorExit::StoppedUnexpectedly + )) + )); + } + + #[test] + fn detector_recovery_required_maps_to_danger_detected() { + let result: DetectorJoinResult = Ok(Ok(DetectorExit::RecoveryRequired { + status: DangerStatus::ClosedBatchInDanger(7), + })); + assert!(matches!( + FirstExit::from(result), + FirstExit::Worker(WorkerExit::DangerDetector( + DangerDetectorExit::DangerDetected { + status: DangerStatus::ClosedBatchInDanger(7) + } + )) + )); + } + + #[test] + fn detector_inner_error_maps_to_source_variant() { + let result: DetectorJoinResult = Ok(Err(DangerDetectorError::Join("boom".into()))); + assert!(matches!( + FirstExit::from(result), + FirstExit::Worker(WorkerExit::DangerDetector(DangerDetectorExit::Source(_))) + )); + } +} diff --git a/sequencer/src/storage/admin.rs b/sequencer/src/storage/admin.rs new file mode 100644 index 0000000..c1ef8d8 --- /dev/null +++ b/sequencer/src/storage/admin.rs @@ -0,0 +1,101 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Operator/admin writes: tune fee policy parameters (`set_log_gas_price`, +//! `set_alpha`). Used today by tests and ad-hoc operator commands; not on the +//! hot path. + +use rusqlite::{Result, params}; + +use super::Storage; + +impl Storage { + pub fn set_log_gas_price(&mut self, log_gas_price: u16) -> Result<()> { + let changed = self.conn.execute( + "UPDATE batch_policy SET log_gas_price = ?1 WHERE singleton_id = 0", + params![i64::from(log_gas_price)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) + } + + /// Set the alpha knob from a `num/denom` rational. Computes both + /// `log_alpha` and `log_one_plus_alpha` (the policy-derived view needs + /// both). Panics if `num + denom` overflows `u64` — a misuse, not a + /// runtime condition. + pub fn set_alpha(&mut self, num: u64, denom: u64) -> Result<()> { + use sequencer_core::fee::log_fee_ratio; + + let log_alpha = log_fee_ratio(num, denom); + let one_plus_alpha_num = num.checked_add(denom).expect( + "set_alpha: num + denom overflows u64; use smaller values for the alpha fraction", + ); + let log_one_plus_alpha = log_fee_ratio(one_plus_alpha_num, denom); + + let changed = self.conn.execute( + "UPDATE batch_policy \ + SET log_alpha = ?1, log_one_plus_alpha = ?2 \ + WHERE singleton_id = 0", + params![i64::from(log_alpha), i64::from(log_one_plus_alpha)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::storage::{Storage, test_helpers::temp_db}; + + #[test] + fn high_gas_price_clamps_recommended_fee_to_max_exponent() { + let db = temp_db("clamp-fee"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Set gas price high enough that log_recommended_fee > MAX_EXPONENT (17101). + // Default: log_recommended_fee = gas_price + 20 + 419 + 621. + // With gas_price = 17000: 17000 + 1060 = 18060 > 17101. + storage + .set_log_gas_price(17000) + .expect("set high gas price"); + + let policy = storage.batch_policy().expect("read policy"); + assert_eq!( + policy.recommended_fee, + sequencer_core::fee::MAX_EXPONENT, + "recommended_fee should be clamped to MAX_EXPONENT" + ); + + // fee_to_linear must not panic with the clamped value. + let _ = sequencer_core::fee::fee_to_linear(policy.recommended_fee); + } + + #[test] + #[should_panic(expected = "num + denom overflows u64")] + fn set_alpha_rejects_overflow() { + let db = temp_db("alpha-overflow"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage.set_alpha(u64::MAX, 1).unwrap(); + } + + /// CHECK constraint guards against alpha values that would push the batch-size + /// target past `log_max_batch_bytes`. Migrated from the old `sql.rs` test suite. + #[test] + fn batch_policy_check_rejects_unsafe_alpha() { + let db = temp_db("unsafe-alpha"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + // log_alpha=-350 → log_batch_size_target = 1403-(-350)-419 = 1334 >= log_max_batch_bytes=1333 + let err = storage.conn.execute( + "UPDATE batch_policy SET log_alpha = ?1, log_one_plus_alpha = ?2 WHERE singleton_id = 0", + [-350_i64, 0_i64], + ); + assert!( + err.is_err(), + "CHECK should reject unsafe alpha (log_batch_size_target >= log_max_batch_bytes)" + ); + } +} diff --git a/sequencer/src/storage/convert.rs b/sequencer/src/storage/convert.rs new file mode 100644 index 0000000..be26c28 --- /dev/null +++ b/sequencer/src/storage/convert.rs @@ -0,0 +1,60 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Saturating width conversions between Rust and SQLite integer types, plus +//! `SystemTime` ↔ `i64` Unix-ms conversions. +//! +//! SQLite stores integers as `INTEGER` (signed 64-bit). Rust domain types use +//! narrower unsigned widths (`u16`, `u32`, `u64`). The conversions here are +//! load-bearing glue that the rest of the storage module calls pervasively. +//! +//! All conversions saturate rather than panic — the domain values we persist +//! are always non-negative and well within `i64::MAX`, but saturation keeps +//! corrupted or malicious DB rows from crashing the process. + +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +// ── Time helpers ────────────────────────────────────────────────────────── + +pub(super) fn to_unix_ms(time: SystemTime) -> i64 { + time.duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() + .try_into() + .unwrap_or(i64::MAX) +} + +pub(super) fn from_unix_ms(ms: i64) -> SystemTime { + let clamped_ms = ms.max(0) as u64; + UNIX_EPOCH + Duration::from_millis(clamped_ms) +} + +/// Current wall-clock time as an `i64` SQLite timestamp. +/// +/// Delegates to [`crate::runtime::clock::unix_now_ms`] so the whole crate goes +/// through one clock entry point. +pub(super) fn now_unix_ms() -> i64 { + i64::try_from(crate::runtime::clock::unix_now_ms()).unwrap_or(i64::MAX) +} + +// ── Width conversions ───────────────────────────────────────────────────── + +pub(super) fn u64_to_i64(value: u64) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} + +pub(super) fn usize_to_i64(value: usize) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} + +pub(super) fn i64_to_u64(value: i64) -> u64 { + value.max(0) as u64 +} + +pub(super) fn i64_to_u16(value: i64) -> u16 { + u16::try_from(value.max(0)).unwrap_or(u16::MAX) +} + +pub(super) fn i64_to_u32(value: i64) -> u32 { + u32::try_from(value.max(0)).unwrap_or(u32::MAX) +} diff --git a/sequencer/src/storage/db.rs b/sequencer/src/storage/db.rs deleted file mode 100644 index 88f8084..0000000 --- a/sequencer/src/storage/db.rs +++ /dev/null @@ -1,1242 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use rusqlite::{Connection, OpenFlags, Result, Transaction, TransactionBehavior}; -use rusqlite_migration::{M, Migrations}; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -use super::sql::{ - sql_count_user_ops_for_frame, sql_insert_open_batch, sql_insert_open_batch_with_index, - sql_insert_open_frame, sql_insert_safe_inputs_batch, sql_insert_sequenced_direct_inputs, - sql_insert_user_ops_batch, sql_select_batch_policy, sql_select_frames_for_batch, - sql_select_latest_batch_index, sql_select_latest_batch_with_user_op_count, - sql_select_latest_frame_in_batch_for_batch, sql_select_max_safe_input_index, - sql_select_ordered_l2_tx_count, sql_select_ordered_l2_txs_for_batch, - sql_select_ordered_l2_txs_from_offset, sql_select_ordered_l2_txs_page_from_offset, - sql_select_safe_block, sql_select_safe_inputs_range, sql_select_total_drained_direct_inputs, - sql_select_user_ops_for_frame, sql_update_batch_policy_alpha, - sql_update_batch_policy_log_gas_price, sql_update_safe_block, -}; -use super::{ - BatchPolicy, FrameHeader, SafeFrontier, SafeInputRange, StorageOpenError, StoredSafeInput, - WriteHead, -}; -use crate::inclusion_lane::PendingUserOp; -use alloy_primitives::Address; -use sequencer_core::batch::{Batch, BatchForSubmission, Frame as BatchFrame, WireUserOp}; -use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; - -const MIGRATION_0001_SCHEMA: &str = include_str!("migrations/0001_schema.sql"); - -pub struct Storage { - conn: Connection, -} - -impl Storage { - pub fn open(path: &str, synchronous: &str) -> std::result::Result { - let conn = Self::open_connection_with_migrations(path, synchronous)?; - Ok(Self { conn }) - } - - pub fn open_without_migrations( - path: &str, - synchronous: &str, - ) -> std::result::Result { - let conn = Self::open_connection(path, synchronous)?; - Ok(Self { conn }) - } - - pub fn open_read_only(path: &str) -> std::result::Result { - let conn = Self::open_connection_read_only(path)?; - Ok(Self { conn }) - } - - pub fn open_connection( - path: &str, - synchronous: &str, - ) -> std::result::Result { - let conn = Connection::open(path)?; - conn.pragma_update(None, "foreign_keys", "ON")?; - conn.pragma_update(None, "journal_mode", "WAL")?; - conn.pragma_update(None, "synchronous", synchronous)?; - conn.pragma_update(None, "busy_timeout", 5000)?; - Ok(conn) - } - - pub fn open_connection_read_only( - path: &str, - ) -> std::result::Result { - let conn = Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; - conn.pragma_update(None, "query_only", "ON")?; - // Readers should fail fast under write pressure to keep tail latency bounded. - conn.pragma_update(None, "busy_timeout", 50)?; - Ok(conn) - } - - pub fn open_connection_with_migrations( - path: &str, - synchronous: &str, - ) -> std::result::Result { - let mut conn = Self::open_connection(path, synchronous)?; - Self::run_migrations(&mut conn)?; - Ok(conn) - } - - pub fn run_migrations(conn: &mut Connection) -> std::result::Result<(), StorageOpenError> { - Migrations::from_slice(&[M::up(MIGRATION_0001_SCHEMA)]).to_latest(conn)?; - Ok(()) - } - - pub fn load_next_undrained_safe_input_index(&mut self) -> Result { - let value = sql_select_total_drained_direct_inputs(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn safe_input_end_exclusive(&mut self) -> Result { - let value = sql_select_max_safe_input_index(&self.conn)?; - Ok(match value { - Some(last_index) => i64_to_u64(last_index).saturating_add(1), - None => 0, - }) - } - - pub fn current_safe_block(&mut self) -> Result { - let value = sql_select_safe_block(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn ensure_minimum_safe_block(&mut self, minimum_safe_block: u64) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - let current_safe_block = query_current_safe_block(&tx)?; - if current_safe_block < minimum_safe_block { - let changed_rows = sql_update_safe_block(&tx, u64_to_i64(minimum_safe_block))?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - } - tx.commit()?; - Ok(()) - } - - pub fn load_safe_frontier(&mut self) -> Result { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let safe_block = query_current_safe_block(&tx)?; - let end_exclusive = query_latest_safe_input_index_exclusive(&tx)?; - tx.commit()?; - Ok(SafeFrontier { - safe_block, - end_exclusive, - }) - } - - /// Scan safe-input payloads for `sender` in pages, SSZ-decode each payload - /// to extract the batch nonce, and compute the longest contiguous nonce - /// prefix starting from 0. Memory is bounded by `page_size` payloads per - /// iteration rather than the full table. - pub fn advance_safe_batch_nonce_for_sender( - &mut self, - sender: Address, - page_size: u64, - ) -> Result<(u64, u64)> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let safe_block = query_current_safe_block(&tx)?; - - const SQL: &str = "SELECT safe_input_index, payload FROM safe_inputs \ - WHERE sender = ?1 AND safe_input_index >= ?2 \ - ORDER BY safe_input_index ASC LIMIT ?3"; - let mut expected: u64 = 0; - let mut offset: i64 = 0; - let limit = i64::try_from(page_size).unwrap_or(i64::MAX); - loop { - let mut stmt = tx.prepare_cached(SQL)?; - let mut rows = stmt.query(rusqlite::params![sender.as_slice(), offset, limit])?; - let mut fetched_rows: i64 = 0; - while let Some(row) = rows.next()? { - fetched_rows += 1; - offset = row.get::<_, i64>(0)?.saturating_add(1); - let payload: Vec = row.get(1)?; - if let Ok(batch) = ::from_ssz_bytes(&payload) - && batch.nonce == expected - { - expected = expected.saturating_add(1); - } - } - if fetched_rows < limit { - break; - } - } - - tx.commit()?; - Ok((safe_block, expected)) - } - - pub fn fill_safe_inputs( - &mut self, - from_inclusive: u64, - to_exclusive: u64, - out: &mut Vec, - ) -> Result<()> { - assert!( - from_inclusive <= to_exclusive, - "invalid safe-input interval [{from_inclusive}, {to_exclusive})" - ); - - if from_inclusive == to_exclusive { - return Ok(()); - } - - let rows = sql_select_safe_inputs_range( - &self.conn, - u64_to_i64(from_inclusive), - u64_to_i64(to_exclusive), - )?; - - let mut fetched_count = 0_u64; - for (offset, row) in rows.into_iter().enumerate() { - let index = i64_to_u64(row.safe_input_index); - let expected = from_inclusive.saturating_add(offset as u64); - - assert_eq!( - index, expected, - "non-contiguous safe-input index: expected {expected}, found {index}" - ); - - out.push(StoredSafeInput { - sender: Address::from_slice(row.sender.as_slice()), - payload: row.payload, - block_number: i64_to_u64(row.block_number), - }); - fetched_count = fetched_count.saturating_add(1); - } - - assert_eq!( - from_inclusive.saturating_add(fetched_count), - to_exclusive, - "safe-input interval [{from_inclusive}, {to_exclusive}) not fully populated" - ); - - Ok(()) - } - - pub fn append_safe_inputs( - &mut self, - safe_block: u64, - inputs: &[StoredSafeInput], - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - - let current_safe_block = query_current_safe_block(&tx)?; - assert!( - safe_block >= current_safe_block, - "safe block regressed: current={current_safe_block}, next={safe_block}" - ); - assert!( - safe_block > current_safe_block || inputs.is_empty(), - "safe block must advance when appending new safe inputs" - ); - - let next_expected = query_latest_safe_input_index_exclusive(&tx)?; - sql_insert_safe_inputs_batch(&tx, next_expected, inputs)?; - let changed_rows = sql_update_safe_block(&tx, u64_to_i64(safe_block))?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - - tx.commit()?; - Ok(()) - } - - pub fn load_open_state(&mut self) -> Result> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let head = load_current_write_head(&tx)?; - tx.commit()?; - Ok(head) - } - - pub fn initialize_open_state( - &mut self, - safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert!( - load_current_write_head(&tx)?.is_none(), - "open state already exists" - ); - - let now_ms = now_unix_ms(); - let policy = query_batch_policy(&tx)?; - insert_open_batch_with_index(&tx, 0, now_ms)?; - insert_open_frame(&tx, 0, 0, now_ms, policy.recommended_fee, safe_block)?; - persist_frame_direct_sequence(&tx, 0, 0, leading_direct_range)?; - tx.commit()?; - - Ok(WriteHead { - batch_index: 0, - batch_created_at: from_unix_ms(now_ms), - frame_fee: policy.recommended_fee, - safe_block, - batch_user_op_count: 0, - open_frame_user_op_count: 0, - frame_in_batch: 0, - max_batch_user_op_bytes: super::batch_size_target_bytes(policy), - }) - } - - pub fn batch_policy(&mut self) -> Result { - let (log_recommended_fee, log_batch_size_target) = sql_select_batch_policy(&self.conn)?; - let max_exp = sequencer_core::fee::MAX_EXPONENT; - Ok(BatchPolicy { - // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. - recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), - batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), - }) - } - - pub fn set_log_gas_price(&mut self, log_gas_price: u16) -> Result<()> { - let changed_rows = - sql_update_batch_policy_log_gas_price(&self.conn, i64::from(log_gas_price))?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - Ok(()) - } - - pub fn set_alpha(&mut self, num: u64, denom: u64) -> Result<()> { - use sequencer_core::fee::log_fee_ratio; - - let log_alpha = log_fee_ratio(num, denom); - let one_plus_alpha_num = num.checked_add(denom).expect( - "set_alpha: num + denom overflows u64; use smaller values for the alpha fraction", - ); - let log_one_plus_alpha = log_fee_ratio(one_plus_alpha_num, denom); - - let changed_rows = sql_update_batch_policy_alpha( - &self.conn, - i64::from(log_alpha), - i64::from(log_one_plus_alpha), - )?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - Ok(()) - } - - pub fn append_user_ops_chunk( - &mut self, - head: &mut WriteHead, - user_ops: &[PendingUserOp], - ) -> Result<()> { - if user_ops.is_empty() { - return Ok(()); - } - - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - // Keep the invariant check inside the write transaction so validation and writes - // observe the same database snapshot. - assert_write_head_matches_open_state(&tx, head)?; - - sql_insert_user_ops_batch( - &tx, - u64_to_i64(head.batch_index), - i64::from(head.frame_in_batch), - head.open_frame_user_op_count, - user_ops, - )?; - - tx.commit()?; - head.increment_batch_user_op_count(user_ops.len()); - Ok(()) - } - - pub fn close_frame_only( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert_write_head_matches_open_state(&tx, head)?; - let now_ms = now_unix_ms(); - let policy = query_batch_policy(&tx)?; - let next_frame_in_batch = head.frame_in_batch.saturating_add(1); - insert_open_frame( - &tx, - head.batch_index, - next_frame_in_batch, - now_ms, - policy.recommended_fee, - next_safe_block, - )?; - persist_frame_direct_sequence( - &tx, - head.batch_index, - next_frame_in_batch, - leading_direct_range, - )?; - tx.commit()?; - head.advance_frame(policy, next_safe_block); - Ok(()) - } - - pub fn close_frame_and_batch( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert_write_head_matches_open_state(&tx, head)?; - let now_ms = now_unix_ms(); - // Batch policy is sampled here: the derived fee is committed to the newly - // opened frame, and the batch size target is stored on the write head. - let policy = query_batch_policy(&tx)?; - let next_batch_index = insert_open_batch(&tx, now_ms)?; - insert_open_frame( - &tx, - next_batch_index, - 0, - now_ms, - policy.recommended_fee, - next_safe_block, - )?; - tx.commit()?; - head.move_to_next_batch( - next_batch_index, - from_unix_ms(now_ms), - policy, - next_safe_block, - ); - Ok(()) - } - - pub fn load_ordered_l2_txs_from(&mut self, offset: u64) -> Result> { - // Read the persisted total order used by catch-up and downstream feed readers. - let rows = sql_select_ordered_l2_txs_from_offset(&self.conn, u64_to_i64(offset))?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn load_ordered_l2_txs_page_from( - &mut self, - offset: u64, - limit: usize, - ) -> Result> { - if limit == 0 { - return Ok(Vec::new()); - } - - let rows = sql_select_ordered_l2_txs_page_from_offset( - &self.conn, - u64_to_i64(offset), - usize_to_i64(limit), - )?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn ordered_l2_tx_count(&mut self) -> Result { - let value = sql_select_ordered_l2_tx_count(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn latest_batch_index(&mut self) -> Result> { - let value = sql_select_latest_batch_index(&self.conn)?; - Ok(value.map(i64_to_u64)) - } - - pub fn load_frames_for_batch(&mut self, batch_index: u64) -> Result> { - let rows = sql_select_frames_for_batch(&self.conn, u64_to_i64(batch_index))?; - Ok(rows - .into_iter() - .map(|row| FrameHeader { - frame_in_batch: i64_to_u32(row.frame_in_batch), - fee: i64_to_u16(row.fee), - safe_block: i64_to_u64(row.safe_block), - }) - .collect()) - } - - pub fn load_ordered_l2_txs_for_batch( - &mut self, - batch_index: u64, - ) -> Result> { - let rows = sql_select_ordered_l2_txs_for_batch(&self.conn, u64_to_i64(batch_index))?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn load_batch_for_submission(&mut self, batch_index: u64) -> Result { - let created_at_ms: i64 = self.conn.query_row( - "SELECT created_at_ms FROM batches WHERE batch_index = ?1 LIMIT 1", - [u64_to_i64(batch_index)], - |row| row.get(0), - )?; - - let frame_headers = self.load_frames_for_batch(batch_index)?; - let mut frames = Vec::with_capacity(frame_headers.len()); - - for header in frame_headers { - let rows = sql_select_user_ops_for_frame( - &self.conn, - u64_to_i64(batch_index), - i64::from(header.frame_in_batch), - )?; - - let user_ops = rows - .into_iter() - .map(|row| WireUserOp { - nonce: i64_to_u32(row.nonce), - max_fee: i64_to_u16(row.max_fee), - data: row.data, - signature: row.sig, - }) - .collect(); - - frames.push(BatchFrame { - user_ops, - safe_block: header.safe_block, - fee_price: header.fee, - }); - } - - let batch = Batch { - nonce: batch_index, - frames, - }; - let created_at_ms_u64 = created_at_ms.max(0) as u64; - - Ok(BatchForSubmission { - batch_index, - created_at_ms: created_at_ms_u64, - batch, - }) - } -} - -fn decode_ordered_l2_txs(rows: Vec) -> Vec { - let mut out = Vec::new(); - - for row in rows { - if row.kind == 0 { - let sender_bytes = row.sender.expect("ordered replay row: missing sender"); - assert_eq!( - sender_bytes.len(), - 20, - "ordered replay row: sender must be 20 bytes" - ); - - let entry = ValidUserOp { - sender: Address::from_slice(sender_bytes.as_slice()), - // Replay uses the persisted frame fee (log-space exponent) to mirror canonical execution. - fee: i64_to_u16(row.fee.expect("ordered replay row: missing fee")), - data: row.data.expect("ordered replay row: missing data"), - }; - out.push(SequencedL2Tx::UserOp(entry)); - } else { - let direct = DirectInput { - sender: Address::from_slice( - row.sender - .expect("ordered replay row: missing sender") - .as_slice(), - ), - block_number: i64_to_u64( - row.block_number - .expect("ordered replay row: missing block_number"), - ), - payload: row.payload.expect("ordered replay row: missing payload"), - }; - out.push(SequencedL2Tx::Direct(direct)); - } - } - - out -} - -fn load_current_write_head(tx: &Transaction<'_>) -> Result> { - let Some((batch_index, batch_created_at, batch_user_op_count)) = query_latest_batch(tx)? else { - return Ok(None); - }; - let (frame_in_batch, frame_fee, safe_block) = query_latest_frame_in_batch(tx, batch_index)?; - let open_frame_user_op_count = query_frame_user_op_count(tx, batch_index, frame_in_batch)?; - let policy = query_batch_policy(tx)?; - Ok(Some(WriteHead { - batch_index, - batch_created_at, - frame_fee, - safe_block, - batch_user_op_count, - open_frame_user_op_count, - frame_in_batch, - max_batch_user_op_bytes: super::batch_size_target_bytes(policy), - })) -} - -fn assert_write_head_matches_open_state(tx: &Transaction<'_>, expected: &WriteHead) -> Result<()> { - let actual = load_current_write_head(tx)?.expect("stale WriteHead: storage has no open state"); - assert_eq!( - expected.batch_index, actual.batch_index, - "stale WriteHead: batch_index mismatch" - ); - assert_eq!( - expected.frame_in_batch, actual.frame_in_batch, - "stale WriteHead: frame_in_batch mismatch" - ); - assert_eq!( - expected.batch_user_op_count, actual.batch_user_op_count, - "stale WriteHead: batch_user_op_count mismatch" - ); - assert_eq!( - expected.open_frame_user_op_count, actual.open_frame_user_op_count, - "stale WriteHead: open_frame_user_op_count mismatch" - ); - assert_eq!( - expected.frame_fee, actual.frame_fee, - "stale WriteHead: frame_fee mismatch" - ); - assert_eq!( - expected.safe_block, actual.safe_block, - "stale WriteHead: safe_block mismatch" - ); - assert_eq!( - to_unix_ms(expected.batch_created_at), - to_unix_ms(actual.batch_created_at), - "stale WriteHead: batch_created_at mismatch" - ); - Ok(()) -} - -fn query_latest_batch(tx: &Transaction<'_>) -> Result> { - match sql_select_latest_batch_with_user_op_count(tx) { - Ok((batch_index, batch_created_at_ms, batch_user_op_count)) => Ok(Some(( - i64_to_u64(batch_index), - from_unix_ms(batch_created_at_ms), - i64_to_u64(batch_user_op_count), - ))), - Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), - Err(source) => Err(source), - } -} - -fn query_latest_frame_in_batch(tx: &Transaction<'_>, batch_index: u64) -> Result<(u32, u16, u64)> { - let (frame_in_batch, frame_fee, safe_block) = - sql_select_latest_frame_in_batch_for_batch(tx, u64_to_i64(batch_index))?; - Ok(( - i64_to_u32(frame_in_batch), - i64_to_u16(frame_fee), - i64_to_u64(safe_block), - )) -} - -fn query_frame_user_op_count( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, -) -> Result { - let value = - sql_count_user_ops_for_frame(tx, u64_to_i64(batch_index), i64::from(frame_in_batch))?; - Ok(i64_to_u32(value)) -} - -fn query_latest_safe_input_index_exclusive(tx: &Connection) -> Result { - let value = sql_select_max_safe_input_index(tx)?; - Ok(match value { - Some(last_index) => i64_to_u64(last_index).saturating_add(1), - None => 0, - }) -} - -fn query_current_safe_block(tx: &Connection) -> Result { - let value = sql_select_safe_block(tx)?; - Ok(i64_to_u64(value)) -} - -fn query_batch_policy(tx: &Transaction<'_>) -> Result { - let (log_recommended_fee, log_batch_size_target) = sql_select_batch_policy(tx)?; - let max_exp = sequencer_core::fee::MAX_EXPONENT; - Ok(BatchPolicy { - // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. - recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), - batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), - }) -} - -fn persist_frame_direct_sequence( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, - drained_direct_range: SafeInputRange, -) -> Result<()> { - sql_insert_sequenced_direct_inputs( - tx, - u64_to_i64(batch_index), - i64::from(frame_in_batch), - drained_direct_range, - ) -} - -fn insert_open_batch(tx: &Transaction<'_>, created_at_ms: i64) -> Result { - sql_insert_open_batch(tx, created_at_ms)?; - Ok(i64_to_u64(tx.last_insert_rowid())) -} - -fn insert_open_batch_with_index( - tx: &Transaction<'_>, - batch_index: u64, - created_at_ms: i64, -) -> Result<()> { - sql_insert_open_batch_with_index(tx, u64_to_i64(batch_index), created_at_ms)?; - Ok(()) -} - -fn insert_open_frame( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, - created_at_ms: i64, - frame_fee: u16, - safe_block: u64, -) -> Result<()> { - sql_insert_open_frame( - tx, - u64_to_i64(batch_index), - i64::from(frame_in_batch), - created_at_ms, - i64::from(frame_fee), - u64_to_i64(safe_block), - )?; - Ok(()) -} - -fn to_unix_ms(time: SystemTime) -> i64 { - time.duration_since(UNIX_EPOCH) - .unwrap_or_default() - .as_millis() - .try_into() - .unwrap_or(i64::MAX) -} - -fn from_unix_ms(ms: i64) -> SystemTime { - let clamped_ms = ms.max(0) as u64; - UNIX_EPOCH + Duration::from_millis(clamped_ms) -} - -fn now_unix_ms() -> i64 { - to_unix_ms(SystemTime::now()) -} - -fn u64_to_i64(value: u64) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -fn usize_to_i64(value: usize) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -fn i64_to_u64(value: i64) -> u64 { - value.max(0) as u64 -} - -fn i64_to_u16(value: i64) -> u16 { - u16::try_from(value.max(0)).unwrap_or(u16::MAX) -} - -fn i64_to_u32(value: i64) -> u32 { - u32::try_from(value.max(0)).unwrap_or(u32::MAX) -} - -#[cfg(test)] -mod tests { - use alloy_primitives::Address; - - use super::Storage; - use crate::storage::{SafeInputRange, StoredSafeInput}; - use sequencer_core::l2_tx::SequencedL2Tx; - use tempfile::TempDir; - - struct TestDb { - _dir: TempDir, - path: String, - } - - fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } - } - - #[test] - fn open_state_is_idempotent_and_rotation_is_atomic() { - let db = temp_db("open-state"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - assert!( - storage - .load_open_state() - .expect("load open state") - .is_none(), - "fresh storage should not have an open frame yet" - ); - - let head_a = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let head_b = storage - .load_open_state() - .expect("load existing open state") - .expect("open state should now exist"); - - assert_eq!(head_a.batch_index, head_b.batch_index); - assert_eq!(head_a.frame_in_batch, head_b.frame_in_batch); - assert_eq!(head_a.frame_fee, head_b.frame_fee); - // Default log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(head_a.frame_fee, 1060); - - let mut head_c = head_b; - let next_safe_block = head_c.safe_block; - storage - .close_frame_only(&mut head_c, next_safe_block, SafeInputRange::empty_at(0)) - .expect("rotate within same batch"); - assert_eq!(head_c.batch_index, head_b.batch_index); - assert_eq!(head_c.frame_in_batch, 1); - - let mut head_d = head_c; - let next_safe_block = head_d.safe_block; - storage - .close_frame_and_batch(&mut head_d, next_safe_block) - .expect("close batch and rotate"); - assert!(head_d.batch_index > head_c.batch_index); - assert_eq!(head_d.frame_in_batch, 0); - } - - #[test] - fn next_frame_fee_comes_from_batch_policy() { - let db = temp_db("batch-policy-fee"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let policy = storage.batch_policy().expect("default policy"); - // Default: log_gas_price=0, log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(policy.recommended_fee, 1060); - - storage.set_log_gas_price(100).expect("set log gas price"); - - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let next_safe_block = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe_block) - .expect("rotate batch"); - - let policy = storage.batch_policy().expect("read policy"); - // log_recommended_fee = 100+20+419+621 = 1160 - assert_eq!(head.frame_fee, 1160); - assert_eq!(head.frame_fee, policy.recommended_fee); - assert!( - head.max_batch_user_op_bytes > 0, - "batch size target should be set" - ); - } - - #[test] - fn high_gas_price_clamps_recommended_fee_to_max_exponent() { - let db = temp_db("clamp-fee"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - // Set gas price high enough that log_recommended_fee > MAX_EXPONENT (17101). - // Default: log_recommended_fee = gas_price + 20 + 419 + 621. - // With gas_price = 17000: 17000 + 1060 = 18060 > 17101. - storage - .set_log_gas_price(17000) - .expect("set high gas price"); - - let policy = storage.batch_policy().expect("read policy"); - assert_eq!( - policy.recommended_fee, - sequencer_core::fee::MAX_EXPONENT, - "recommended_fee should be clamped to MAX_EXPONENT" - ); - - // fee_to_linear must not panic with the clamped value. - let _ = sequencer_core::fee::fee_to_linear(policy.recommended_fee); - } - - #[test] - #[should_panic(expected = "num + denom overflows u64")] - fn set_alpha_rejects_overflow() { - let db = temp_db("alpha-overflow"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - storage.set_alpha(u64::MAX, 1).unwrap(); - } - - #[test] - fn replay_returns_direct_inputs_in_drain_order() { - let db = temp_db("replay-order"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - let drained = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xaa], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xbb], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, drained.as_slice()) - .expect("insert direct inputs"); - let mut head = head; - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) - .expect("close frame with directs"); - - let replay = storage.load_ordered_l2_txs_from(0).expect("load replay"); - assert_eq!(replay.len(), 2); - match &replay[0] { - SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xaa]), - _ => panic!("expected direct input at position 0"), - } - match &replay[1] { - SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xbb]), - _ => panic!("expected direct input at position 1"), - } - } - - #[test] - fn next_undrained_safe_input_index_is_derived_from_sequenced_directs() { - let db = temp_db("safe-cursor"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - assert_eq!( - storage - .load_next_undrained_safe_input_index() - .expect("empty cursor"), - 0 - ); - - let head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let drained = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0x00], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0x02], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, drained.as_slice()) - .expect("insert direct inputs"); - let mut head = head; - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) - .expect("close frame with directs"); - - assert_eq!( - storage - .load_next_undrained_safe_input_index() - .expect("derived cursor"), - 2 - ); - } - - #[test] - fn safe_input_api_uses_half_open_intervals() { - let db = temp_db("safe-input-api"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 0); - let mut out = Vec::new(); - storage - .fill_safe_inputs(0, 0, &mut out) - .expect("query empty interval"); - assert!(out.is_empty()); - - let inserted = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xa0], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xb1], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, inserted.as_slice()) - .expect("insert safe directs"); - - assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 2); - - storage - .fill_safe_inputs(0, 2, &mut out) - .expect("query full interval"); - assert_eq!(out, inserted); - - out.clear(); - storage - .fill_safe_inputs(1, 1, &mut out) - .expect("query empty half-open interval"); - assert!(out.is_empty()); - } - - #[test] - fn ensure_minimum_safe_block_only_moves_forward() { - let db = temp_db("ensure-min-safe-block"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - storage - .ensure_minimum_safe_block(7) - .expect("advance bootstrap safe head"); - assert_eq!(storage.current_safe_block().expect("read advanced"), 7); - - storage - .ensure_minimum_safe_block(3) - .expect("do not regress bootstrap safe head"); - assert_eq!(storage.current_safe_block().expect("read unchanged"), 7); - } - - #[test] - fn initialize_open_state_creates_first_real_batch_and_frame() { - let db = temp_db("initialize-open-state"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - let head = storage - .initialize_open_state(12, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - assert_eq!(head.batch_index, 0); - assert_eq!(head.frame_in_batch, 0); - assert_eq!(head.safe_block, 12); - - let loaded = storage - .load_open_state() - .expect("load open state") - .expect("open state should exist"); - assert_eq!(loaded.batch_index, 0); - assert_eq!(loaded.frame_in_batch, 0); - assert_eq!(loaded.safe_block, 12); - } - - #[test] - fn batch_for_submission_builds_from_storage() { - let db = temp_db("batch-for-submission"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - let head = storage - .initialize_open_state(12, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - assert_eq!(head.batch_index, 0); - - let batch = storage - .load_batch_for_submission(0) - .expect("load batch for submission"); - - assert_eq!(batch.batch_index, 0); - assert_eq!(batch.batch.frames.len(), 1); - let frame = &batch.batch.frames[0]; - assert!(frame.user_ops.is_empty()); - assert_eq!(frame.safe_block, 12); - // Default log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(frame.fee_price, 1060); - assert!(batch.created_at_ms > 0); - } - - #[test] - fn batch_level_helpers_expose_latest_index_frames_and_txs() { - let db = temp_db("batch-level-helpers"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - // Before initialization there should be no batches. - assert!( - storage - .latest_batch_index() - .expect("query latest batch nonce on empty db") - .is_none() - ); - - // Initialize first batch/frame and append some data. - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - // Close current batch and move to next so batch 0 becomes closed. - let next_safe_block = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe_block) - .expect("close batch and rotate"); - - // Latest batch nonce should now be 1 (open), with batch 0 closed. - let latest = storage - .latest_batch_index() - .expect("query latest batch nonce") - .expect("latest batch should exist"); - assert_eq!(latest, 1); - - // Batch 0 should still have at least one frame header. - let frames = storage - .load_frames_for_batch(0) - .expect("load frames for batch 0"); - assert!(!frames.is_empty()); - - // Ordered L2 txs for batch 0 should be queryable (even if empty). - let txs = storage - .load_ordered_l2_txs_for_batch(0) - .expect("load l2 txs for batch 0"); - assert!( - txs.is_empty(), - "fresh batch should not have sequenced txs yet" - ); - } - - /// Helper: insert safe inputs whose payloads are SSZ-encoded batches with - /// the given nonces, all attributed to `sender`. - fn seed_safe_inputs_with_batch_nonces( - storage: &mut Storage, - sender: Address, - safe_block: u64, - nonces: &[u64], - ) { - let inputs: Vec = nonces - .iter() - .map(|nonce| StoredSafeInput { - sender, - payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { - nonce: *nonce, - frames: Vec::new(), - }), - block_number: safe_block, - }) - .collect(); - storage - .append_safe_inputs(safe_block, inputs.as_slice()) - .expect("append safe inputs"); - } - - const SENDER_A: Address = Address::repeat_byte(0xAA); - const SENDER_B: Address = Address::repeat_byte(0xBB); - - #[test] - fn advance_safe_batch_nonce_returns_zero_when_no_inputs_exist() { - let db = temp_db("advance-nonce-empty"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(next, 0); - } - - #[test] - fn advance_safe_batch_nonce_contiguous_prefix() { - let db = temp_db("advance-nonce-contiguous"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - - let (safe_block, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(safe_block, 10); - assert_eq!(next, 3); - } - - #[test] - fn advance_safe_batch_nonce_stops_at_gap() { - let db = temp_db("advance-nonce-gap"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - // nonces: 0, 1, 3, 4, 5 — gap at 2 - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(next, 2); - } - - #[test] - fn advance_safe_batch_nonce_works_across_page_boundaries() { - let db = temp_db("advance-nonce-paged"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - // 5 contiguous nonces with page_size=2 → 3 pages - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2, 3, 4]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce"); - assert_eq!(next, 5); - } - - #[test] - fn advance_safe_batch_nonce_gap_spans_page_boundary() { - let db = temp_db("advance-nonce-gap-across-page"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - // page_size=2: page0=[0,1], page1=[3,4], page2=[5] - // gap at nonce 2 — should still detect it - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce"); - assert_eq!(next, 2); - } - - #[test] - fn advance_safe_batch_nonce_filters_by_sender() { - let db = temp_db("advance-nonce-sender-filter"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_B, 11, &[0]); - - let (_, next_a) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce A"); - let (_, next_b) = storage - .advance_safe_batch_nonce_for_sender(SENDER_B, 2) - .expect("advance nonce B"); - assert_eq!(next_a, 3); - assert_eq!(next_b, 1); - } - - #[test] - fn advance_safe_batch_nonce_page_size_one() { - let db = temp_db("advance-nonce-page-1"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 1) - .expect("advance nonce"); - assert_eq!(next, 3); - } -} diff --git a/sequencer/src/storage/egress.rs b/sequencer/src/storage/egress.rs new file mode 100644 index 0000000..dbf98e7 --- /dev/null +++ b/sequencer/src/storage/egress.rs @@ -0,0 +1,134 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Egress reader: ordered-L2-tx queries used by the WS feed and catch-up replay. +//! +//! Read-only — every method here either pages the `valid_sequenced_l2_txs` view +//! or counts over it. The view encapsulates the exclusion of invalidated batches +//! so callers don't repeat the filter. + +use alloy_primitives::Address; +use rusqlite::{Result, params}; + +use super::Storage; +use super::convert::{i64_to_u64, u64_to_i64, usize_to_i64}; +use super::queries::decode_l2_tx_row; +use sequencer_core::l2_tx::SequencedL2Tx; + +impl Storage { + /// Load a page of ordered L2 transactions starting after the given offset. + /// Returns `(db_offset, tx)` pairs. Callers should track `db_offset` of the + /// last item as their cursor, not increment a counter. + pub fn ordered_l2_txs_page_from( + &mut self, + offset: u64, + limit: usize, + ) -> Result> { + if limit == 0 { + return Ok(Vec::new()); + } + + const SQL: &str = " + SELECT + s.offset, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, + CASE + WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender + WHEN s.safe_input_index IS NOT NULL THEN d.sender + ELSE NULL + END AS sender, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number + FROM valid_sequenced_l2_txs s + LEFT JOIN user_ops u + ON u.batch_index = s.batch_index + AND u.frame_in_batch = s.frame_in_batch + AND u.pos_in_frame = s.user_op_pos_in_frame + LEFT JOIN frames f + ON f.batch_index = s.batch_index + AND f.frame_in_batch = s.frame_in_batch + LEFT JOIN safe_inputs d + ON d.safe_input_index = s.safe_input_index + WHERE s.offset > ?1 + ORDER BY s.offset ASC + LIMIT ?2 + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(offset), usize_to_i64(limit)], |row| { + let db_offset: i64 = row.get(0)?; + let tx = decode_l2_tx_row( + row.get(1)?, + row.get(2)?, + row.get(3)?, + row.get(4)?, + row.get(5)?, + row.get(6)?, + ); + Ok((i64_to_u64(db_offset), tx)) + })?; + rows.collect::>>() + } + + /// Returns the maximum offset in `valid_sequenced_l2_txs`, or 0 if empty. + /// Used as the head cursor for feed subscribers. + pub fn ordered_l2_tx_head_offset(&mut self) -> Result { + let value: Option = self.conn.query_row( + "SELECT MAX(offset) FROM valid_sequenced_l2_txs", + [], + |row| row.get(0), + )?; + Ok(value.map(i64_to_u64).unwrap_or(0)) + } + + /// Count broadcastable events with offset > `from_offset`, capped at `limit`. + /// + /// Used for catch-up window checks. Excludes batch-submitter direct inputs + /// (which are filtered before WS delivery) so the count reflects what the + /// client actually receives. + pub fn count_broadcastable_events_after( + &mut self, + from_offset: u64, + limit: u64, + batch_submitter_address: Option

, + ) -> Result { + if limit == 0 { + return Ok(0); + } + + let value: i64 = match batch_submitter_address { + Some(addr) => { + const SQL: &str = " + SELECT COUNT(*) FROM ( + SELECT 1 FROM valid_sequenced_l2_txs s + WHERE s.offset > ?1 + AND NOT (s.safe_input_index IS NOT NULL + AND EXISTS (SELECT 1 FROM safe_inputs si + WHERE si.safe_input_index = s.safe_input_index + AND si.sender = ?2)) + LIMIT ?3 + )"; + self.conn.query_row( + SQL, + params![u64_to_i64(from_offset), addr.as_slice(), u64_to_i64(limit)], + |row| row.get(0), + )? + } + None => { + const SQL: &str = " + SELECT COUNT(*) FROM ( + SELECT 1 FROM valid_sequenced_l2_txs + WHERE offset > ?1 + LIMIT ?2 + )"; + self.conn.query_row( + SQL, + params![u64_to_i64(from_offset), u64_to_i64(limit)], + |row| row.get(0), + )? + } + }; + Ok(i64_to_u64(value)) + } +} diff --git a/sequencer/src/storage/ingress.rs b/sequencer/src/storage/ingress.rs new file mode 100644 index 0000000..72a2b30 --- /dev/null +++ b/sequencer/src/storage/ingress.rs @@ -0,0 +1,547 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Inclusion-lane writer: opens the initial batch/frame, appends user-op chunks, +//! and rotates frame/batch boundaries on the hot path. +//! +//! The lane also reads `safe_inputs` (executed by the application) and the open +//! state (resumed on startup) — those reads live here too because they're driven +//! by the lane's flow, not by an L1 ingress event. + +use alloy_primitives::Address; +use rusqlite::{Result, Transaction, params}; + +use super::convert::{from_unix_ms, i64_to_u64, now_unix_ms, to_unix_ms, u64_to_i64}; +use super::mutations::{ + insert_new_batch, insert_open_frame, persist_frame_direct_sequence, seal_batch, +}; +use super::queries::{ + current_safe_block_required, load_current_write_head, query_batch_policy, + query_latest_safe_input_index_exclusive, +}; +use super::{ + BatchPolicy, SafeInputFrontier, SafeInputRange, Storage, StoredSafeInput, WriteHead, + batch_size_target_bytes, +}; +use crate::ingress::inclusion_lane::PendingUserOp; + +impl Storage { + /// Cursor for the next safe input to drain into a frame. Reads the highest + /// already-drained `safe_input_index` from the valid (non-invalidated) + /// `sequenced_l2_txs` rows and returns `MAX + 1` (or 0 if none). + /// + /// Using `MAX + 1` instead of `COUNT(*)` makes this robust against gaps: + /// when a batch is invalidated, those rows drop out of the view and the + /// cursor naturally rewinds, allowing the recovery batch to re-drain. + pub fn next_undrained_safe_input_index(&mut self) -> Result { + const SQL: &str = " + SELECT COALESCE(MAX(safe_input_index) + 1, 0) + FROM valid_sequenced_l2_txs + WHERE safe_input_index IS NOT NULL + "; + let value: i64 = self.conn.query_row(SQL, [], |row| row.get(0))?; + Ok(i64_to_u64(value)) + } + + /// Resume the lane on startup. Returns `None` if storage is empty (caller + /// should follow up with [`Storage::initialize_open_state`]). + pub fn open_state(&mut self) -> Result> { + self.read(load_current_write_head) + } + + /// Bootstrap the very first batch + frame. Asserts that no open state + /// exists; call only when [`Storage::open_state`] returns `None`. + pub fn initialize_open_state( + &mut self, + safe_block: u64, + leading_direct_range: SafeInputRange, + ) -> Result { + self.write(|tx| { + assert!( + load_current_write_head(tx)?.is_none(), + "open state already exists" + ); + + let now_ms = now_unix_ms(); + let policy = query_batch_policy(tx)?; + // Genesis: explicit batch_index = 0, parent = None, nonce = 0. + insert_new_batch(tx, Some(0), None, now_ms)?; + insert_open_frame(tx, 0, 0, now_ms, policy.recommended_fee, safe_block)?; + persist_frame_direct_sequence(tx, 0, 0, leading_direct_range)?; + + Ok(WriteHead { + batch_index: 0, + batch_created_at: from_unix_ms(now_ms), + frame_fee: policy.recommended_fee, + safe_block, + batch_user_op_count: 0, + open_frame_user_op_count: 0, + frame_in_batch: 0, + max_batch_user_op_bytes: batch_size_target_bytes(policy), + }) + }) + } + + /// Snapshot the current L1 view: safe block + exclusive safe-input cursor. + /// The lane uses this to decide whether to advance. + /// + /// **Precondition:** at least one safe-head observation must have been + /// recorded. The lane only starts after `run_preemptive_recovery` + /// completes, which guarantees this in production. + pub fn safe_input_frontier(&mut self) -> Result { + self.read(|tx| { + Ok(SafeInputFrontier { + safe_block: current_safe_block_required(tx)?, + end_exclusive: query_latest_safe_input_index_exclusive(tx)?, + }) + }) + } + + /// Replace `out`'s contents with the safe-input rows in `range`. Asserts + /// contiguity — gaps in `safe_input_index` are a bug, not a runtime + /// condition. + pub fn fill_safe_inputs( + &mut self, + range: SafeInputRange, + out: &mut Vec, + ) -> Result<()> { + out.clear(); + if range.is_empty() { + return Ok(()); + } + + const SQL: &str = " + SELECT safe_input_index, sender, payload, block_number + FROM safe_inputs + WHERE safe_input_index >= ?1 AND safe_input_index < ?2 + ORDER BY safe_input_index ASC + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map( + params![u64_to_i64(range.start()), u64_to_i64(range.end())], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, Vec>(1)?, + row.get::<_, Vec>(2)?, + row.get::<_, i64>(3)?, + )) + }, + )?; + + let mut fetched_count = 0_u64; + for (offset, row) in rows.enumerate() { + let (index_i64, sender, payload, block_number_i64) = row?; + let index = i64_to_u64(index_i64); + let expected = range.start().saturating_add(offset as u64); + + assert_eq!( + index, expected, + "non-contiguous safe-input index: expected {expected}, found {index}" + ); + + out.push(StoredSafeInput { + sender: Address::from_slice(sender.as_slice()), + payload, + block_number: i64_to_u64(block_number_i64), + }); + fetched_count = fetched_count.saturating_add(1); + } + + assert_eq!( + range.start().saturating_add(fetched_count), + range.end(), + "safe-input range {range:?} not fully populated" + ); + + Ok(()) + } + + /// Persist a chunk of user ops into the open frame and bump `head`'s + /// counters. + /// + /// `head` is treated as authoritative: the lane is the only writer of + /// open-frame state, so a stale `WriteHead` indicates a bug in the lane, + /// not a runtime condition. The schema's FK + PK constraints catch the + /// dangerous failure modes (write to a non-existent frame, duplicate + /// `pos_in_frame`) by failing the INSERT. + pub fn append_user_ops_chunk( + &mut self, + head: &mut WriteHead, + user_ops: &[PendingUserOp], + ) -> Result<()> { + if user_ops.is_empty() { + return Ok(()); + } + self.write(|tx| { + insert_user_ops_batch( + tx, + head.batch_index, + head.frame_in_batch, + head.open_frame_user_op_count, + user_ops, + ) + })?; + head.increment_batch_user_op_count(user_ops.len()); + Ok(()) + } + + /// Rotate to the next frame inside the same batch. Used when the safe + /// block advances but batch policy hasn't triggered a batch close — the + /// new frame inherits the batch and gets a fresh fee/safe-block. + pub fn close_frame_only( + &mut self, + head: &mut WriteHead, + next_safe_block: u64, + leading_direct_range: SafeInputRange, + ) -> Result<()> { + let policy = self.write(|tx| { + let now_ms = now_unix_ms(); + let policy = query_batch_policy(tx)?; + let next_frame_in_batch = head.frame_in_batch.saturating_add(1); + insert_open_frame( + tx, + head.batch_index, + next_frame_in_batch, + now_ms, + policy.recommended_fee, + next_safe_block, + )?; + persist_frame_direct_sequence( + tx, + head.batch_index, + next_frame_in_batch, + leading_direct_range, + )?; + Ok(policy) + })?; + head.advance_frame(policy, next_safe_block); + Ok(()) + } + + /// Close the current batch and open a fresh one with its first frame. + /// Used when batch policy (size/deadline) triggers a batch close. + /// + /// Atomically: seal the current Tip (sets `sealed_at_ms`), insert the new + /// Tip with `parent_batch_index = head.batch_index`, open its first frame. + /// Order matters: sealing first removes the old row from the + /// `ux_single_valid_tip` partial index, making room for the new Tip. + pub fn close_frame_and_batch( + &mut self, + head: &mut WriteHead, + next_safe_block: u64, + ) -> Result<()> { + let (next_batch_index, now_ms, policy) = self.write(|tx| { + let now_ms = now_unix_ms(); + // Batch policy is sampled here: the derived fee is committed to the newly + // opened frame, and the batch size target is stored on the write head. + let policy = query_batch_policy(tx)?; + seal_batch(tx, head.batch_index, now_ms)?; + let next_batch_index = insert_new_batch(tx, None, Some(head.batch_index), now_ms)?; + insert_open_frame( + tx, + next_batch_index, + 0, + now_ms, + policy.recommended_fee, + next_safe_block, + )?; + Ok((next_batch_index, now_ms, policy)) + })?; + head.move_to_next_batch( + next_batch_index, + from_unix_ms(now_ms), + policy, + next_safe_block, + ); + Ok(()) + } + + pub fn batch_policy(&mut self) -> Result { + query_batch_policy(&self.conn) + } +} + +/// Insert user ops into `user_ops`. The `trg_sequence_user_op` trigger then +/// appends the matching `sequenced_l2_txs` row for each insert. +fn insert_user_ops_batch( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + frame_pos_start: u32, + user_ops: &[PendingUserOp], +) -> Result<()> { + if user_ops.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO user_ops ( + batch_index, frame_in_batch, pos_in_frame, + sender, nonce, max_fee, data, sig, received_at_ms + ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)", + )?; + for (offset, item) in user_ops.iter().enumerate() { + let pos_in_frame = frame_pos_start.saturating_add(offset as u32); + let sig = item.signed.signature.as_bytes(); + stmt.execute(params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + i64::from(pos_in_frame), + item.signed.sender.as_slice(), + i64::from(item.signed.user_op.nonce), + i64::from(item.signed.user_op.max_fee), + item.signed.user_op.data.as_ref(), + &sig[..], + to_unix_ms(item.received_at), + ])?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::storage::{ + SafeInputRange, Storage, StoredSafeInput, + test_helpers::{SENDER_A, default_protocol_timing, temp_db}, + }; + use alloy_primitives::Address; + use sequencer_core::l2_tx::SequencedL2Tx; + + #[test] + fn open_state_is_idempotent_and_rotation_is_atomic() { + let db = temp_db("open-state"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + assert!( + storage.open_state().expect("load open state").is_none(), + "fresh storage should not have an open frame yet" + ); + + let head_a = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let head_b = storage + .open_state() + .expect("load existing open state") + .expect("open state should now exist"); + + assert_eq!(head_a.batch_index, head_b.batch_index); + assert_eq!(head_a.frame_in_batch, head_b.frame_in_batch); + assert_eq!(head_a.frame_fee, head_b.frame_fee); + // Default log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(head_a.frame_fee, 1060); + + let mut head_c = head_b; + let next_safe_block = head_c.safe_block; + storage + .close_frame_only(&mut head_c, next_safe_block, SafeInputRange::empty_at(0)) + .expect("rotate within same batch"); + assert_eq!(head_c.batch_index, head_b.batch_index); + assert_eq!(head_c.frame_in_batch, 1); + + let mut head_d = head_c; + let next_safe_block = head_d.safe_block; + storage + .close_frame_and_batch(&mut head_d, next_safe_block) + .expect("close batch and rotate"); + assert!(head_d.batch_index > head_c.batch_index); + assert_eq!(head_d.frame_in_batch, 0); + } + + #[test] + fn next_frame_fee_comes_from_batch_policy() { + let db = temp_db("batch-policy-fee"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let policy = storage.batch_policy().expect("default policy"); + // Default: log_gas_price=0, log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(policy.recommended_fee, 1060); + + storage.set_log_gas_price(100).expect("set log gas price"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe_block) + .expect("rotate batch"); + + let policy = storage.batch_policy().expect("read policy"); + // log_recommended_fee = 100+20+419+621 = 1160 + assert_eq!(head.frame_fee, 1160); + assert_eq!(head.frame_fee, policy.recommended_fee); + assert!( + head.max_batch_user_op_bytes > 0, + "batch size target should be set" + ); + } + + #[test] + fn frame_fee_is_immutable_for_the_lifetime_of_the_frame() { + // : once a frame is opened at fee F, a policy update mid-frame + // must NOT change the open frame's committed fee. Only the *next* + // frame (after close) sees the new policy. This pins the write-once + // contract `frames.fee` relies on — users submitting against the open + // frame know the fee they're paying, regardless of upstream policy + // drift during their round-trip. + let db = temp_db("frame-fee-immutable"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let original_batch_index = head.batch_index; + let original_frame_in_batch = head.frame_in_batch; + // Default: log_gas_price=0 → log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(head.frame_fee, 1060); + + // Simulate an operator policy update mid-frame: fee oracle reports a + // higher gas price. The derived view reflects the new fee immediately. + storage + .set_log_gas_price(100) + .expect("set higher log gas price"); + let new_policy = storage.batch_policy().expect("read updated policy"); + assert_eq!( + new_policy.recommended_fee, 1160, + "policy-derived fee should reflect the new gas price", + ); + + // Invariant: the already-open frame's persisted fee stays at 1060. + let persisted_frame_fee: i64 = storage + .conn + .query_row( + "SELECT fee FROM frames WHERE batch_index = ?1 AND frame_in_batch = ?2", + rusqlite::params![original_batch_index as i64, original_frame_in_batch as i64,], + |row| row.get(0), + ) + .expect("query open frame fee"); + assert_eq!( + persisted_frame_fee, 1060, + "open frame's committed fee must not change across policy updates", + ); + + // And the in-memory WriteHead mirror must also be stable — the lane + // submitting against this head should see a consistent fee. + assert_eq!( + head.frame_fee, 1060, + "WriteHead.frame_fee must stay stable until advance_frame runs", + ); + + // Closing the frame picks up the new policy — the *next* frame opens + // at 1160. This is the expected policy-flow boundary. + let next_safe_block = head.safe_block; + storage + .close_frame_only(&mut head, next_safe_block, SafeInputRange::empty_at(0)) + .expect("rotate within same batch"); + assert_eq!( + head.frame_fee, 1160, + "the next frame must use the updated policy's fee (policy flows in at close)", + ); + } + + #[test] + fn next_undrained_safe_input_index_is_derived_from_sequenced_directs() { + let db = temp_db("safe-cursor"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("empty cursor"), + 0 + ); + + let head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let drained = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0x00], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0x02], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, drained.as_slice(), SENDER_A, &default_protocol_timing()) + .expect("insert direct inputs"); + let mut head = head; + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) + .expect("close frame with directs"); + + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("derived cursor"), + 2 + ); + } + + #[test] + fn initialize_open_state_creates_first_real_batch_and_frame() { + let db = temp_db("initialize-open-state"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let head = storage + .initialize_open_state(12, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + assert_eq!(head.batch_index, 0); + assert_eq!(head.frame_in_batch, 0); + assert_eq!(head.safe_block, 12); + + let loaded = storage + .open_state() + .expect("load open state") + .expect("open state should exist"); + assert_eq!(loaded.batch_index, 0); + assert_eq!(loaded.frame_in_batch, 0); + assert_eq!(loaded.safe_block, 12); + } + + #[test] + fn replay_returns_direct_inputs_in_drain_order() { + let db = temp_db("replay-order"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + let drained = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, drained.as_slice(), SENDER_A, &default_protocol_timing()) + .expect("insert direct inputs"); + let mut head = head; + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) + .expect("close frame with directs"); + + let replay = storage + .ordered_l2_txs_page_from(0, 100) + .expect("load replay"); + assert_eq!(replay.len(), 2); + match &replay[0].1 { + SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xaa]), + _ => panic!("expected direct input at position 0"), + } + match &replay[1].1 { + SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xbb]), + _ => panic!("expected direct input at position 1"), + } + } +} diff --git a/sequencer/src/storage/l1_inputs.rs b/sequencer/src/storage/l1_inputs.rs new file mode 100644 index 0000000..97344e9 --- /dev/null +++ b/sequencer/src/storage/l1_inputs.rs @@ -0,0 +1,397 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Input reader writer: ingests L1 InputBox events into `safe_inputs`, +//! advances `l1_safe_head`, and pins the deployment identity. +//! +//! Also exposes the read-side queries the input reader and other callers need +//! (current safe block, safe-input bounds, last safe-progress timestamp). + +use alloy_primitives::Address; +use rusqlite::{OptionalExtension, Result, Transaction, params}; + +use super::Storage; +use super::convert::{i64_to_u64, now_unix_ms, u64_to_i64}; +use super::queries::{ + current_safe_block, current_safe_block_timestamp, last_safe_progress_ms, + query_latest_safe_input_index_exclusive, +}; +use super::safe_accepted_batches::populate_safe_accepted_batches; +use super::{DeploymentIdentity, StoredSafeInput}; +use sequencer_core::protocol::ProtocolTiming; + +impl Storage { + /// `MAX(safe_input_index) + 1` (or 0 if empty). The exclusive bound on the + /// `safe_inputs` table — the next index a fresh row would receive. + pub fn safe_input_end_exclusive(&mut self) -> Result { + query_latest_safe_input_index_exclusive(&self.conn) + } + + pub fn current_safe_block(&mut self) -> Result> { + current_safe_block(&self.conn) + } + + pub fn current_safe_block_timestamp(&mut self) -> Result> { + current_safe_block_timestamp(&self.conn) + } + + /// Atomically: insert `inputs` (assigned contiguous indexes starting from + /// the current MAX+1), advance `l1_safe_head.block_number` to `safe_block`, + /// stamp `synced_at_ms` as the wall-clock time when the safe frontier + /// advanced, and update `safe_accepted_batches` via `protocol` so the + /// scheduler-accepted frontier view stays consistent with the safe head. + /// + /// The materialized `safe_accepted_batches` view is an invariant of this + /// operation: after a successful `append_safe_inputs`, every safe input up + /// to `safe_block` has been evaluated against the scheduler's acceptance + /// rules and recorded in `safe_accepted_batches`. Readers (submitter, + /// recovery, danger checks) never need to populate separately. + /// + /// Asserts `safe_block` is monotonic and that it strictly advances when + /// `inputs` is non-empty. + pub fn append_safe_inputs( + &mut self, + safe_block: u64, + inputs: &[StoredSafeInput], + batch_submitter: Address, + timing: &ProtocolTiming, + ) -> Result<()> { + self.append_safe_inputs_with_timestamp( + safe_block, + i64_to_u64(now_unix_ms()) / 1000, + inputs, + batch_submitter, + timing, + ) + } + + /// Same as [`Storage::append_safe_inputs`], but records the L1 timestamp + /// of `safe_block`. Production input-reader code should use this path; + /// the shorter helper exists for tests that only need a fresh synthetic + /// safe head. + pub fn append_safe_inputs_with_timestamp( + &mut self, + safe_block: u64, + safe_block_timestamp: u64, + inputs: &[StoredSafeInput], + batch_submitter: Address, + timing: &ProtocolTiming, + ) -> Result<()> { + self.write(|tx| { + if let Some(current) = current_safe_block(tx)? { + assert!( + safe_block >= current, + "safe block regressed: current={current}, next={safe_block}" + ); + assert!( + safe_block > current || inputs.is_empty(), + "safe block must advance when appending new safe inputs" + ); + } + + let next_index = query_latest_safe_input_index_exclusive(tx)?; + insert_safe_inputs_batch(tx, next_index, inputs)?; + + let changed = tx.execute( + "INSERT INTO l1_safe_head \ + (singleton_id, block_number, block_timestamp, synced_at_ms) \ + VALUES (0, ?1, ?2, ?3) \ + ON CONFLICT(singleton_id) DO UPDATE SET \ + block_number = excluded.block_number, \ + block_timestamp = excluded.block_timestamp, \ + synced_at_ms = excluded.synced_at_ms", + params![ + u64_to_i64(safe_block), + u64_to_i64(safe_block_timestamp), + now_unix_ms() + ], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + + populate_safe_accepted_batches(tx, batch_submitter, timing) + }) + } + + /// Wall-clock timestamp (Unix ms) of the last observed safe-head advance, + /// or `None` if no real safe-head observation has occurred yet. + pub fn last_safe_progress_ms(&self) -> Result> { + last_safe_progress_ms(&self.conn) + } + + /// Read the deployment identity this DB is pinned to. Returns `None` on + /// first startup, before L1 bootstrap has discovered the InputBox stream. + pub fn deployment_identity(&self) -> Result> { + query_deployment_identity(&self.conn) + } + + /// Whether this DB already contains deployment-bound state. Used to avoid + /// silently pinning an old, non-empty DB that predates `deployment_identity` + /// to whatever config happens to start it next. + pub fn has_persisted_deployment_state(&self) -> Result { + let present: i64 = self.conn.query_row( + "SELECT \ + EXISTS(SELECT 1 FROM batches) OR \ + EXISTS(SELECT 1 FROM safe_inputs) OR \ + EXISTS(SELECT 1 FROM l1_safe_head)", + [], + |row| row.get(0), + )?; + Ok(present != 0) + } + + /// Insert `identity` on first startup, or return the already-persisted + /// identity on later startups. The caller compares the returned value with + /// its configured/discovered identity and refuses on mismatch. + pub fn load_or_insert_deployment_identity( + &mut self, + identity: DeploymentIdentity, + ) -> Result { + self.write(|tx| { + if let Some(existing) = query_deployment_identity(tx)? { + return Ok(existing); + } + + let changed = tx.execute( + "INSERT INTO deployment_identity \ + (singleton_id, chain_id, app_address, input_box_address, \ + input_box_genesis_block, batch_submitter_address) \ + VALUES (0, ?1, ?2, ?3, ?4, ?5)", + params![ + u64_to_i64(identity.chain_id), + identity.app_address.as_slice(), + identity.input_box_address.as_slice(), + u64_to_i64(identity.input_box_genesis_block), + identity.batch_submitter_address.as_slice(), + ], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(identity) + }) + } +} + +fn query_deployment_identity(conn: &rusqlite::Connection) -> Result> { + conn.query_row( + "SELECT chain_id, app_address, input_box_address, \ + input_box_genesis_block, batch_submitter_address \ + FROM deployment_identity WHERE singleton_id = 0", + [], + |row| { + Ok(DeploymentIdentity { + chain_id: i64_to_u64(row.get::<_, i64>(0)?), + app_address: Address::from_slice(&row.get::<_, Vec>(1)?), + input_box_address: Address::from_slice(&row.get::<_, Vec>(2)?), + input_box_genesis_block: i64_to_u64(row.get::<_, i64>(3)?), + batch_submitter_address: Address::from_slice(&row.get::<_, Vec>(4)?), + }) + }, + ) + .optional() +} + +fn insert_safe_inputs_batch( + tx: &Transaction<'_>, + start_index: u64, + inputs: &[StoredSafeInput], +) -> Result<()> { + if inputs.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (?1, ?2, ?3, ?4)", + )?; + for (offset, input) in inputs.iter().enumerate() { + stmt.execute(params![ + u64_to_i64(start_index.saturating_add(offset as u64)), + input.sender.as_slice(), + input.payload.as_slice(), + u64_to_i64(input.block_number), + ])?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::storage::{ + DeploymentIdentity, SafeInputRange, Storage, StoredSafeInput, + test_helpers::{SENDER_A, SENDER_B, default_protocol_timing, temp_db}, + }; + use alloy_primitives::Address; + + fn identity() -> DeploymentIdentity { + DeploymentIdentity { + chain_id: 31337, + app_address: Address::repeat_byte(0x11), + input_box_address: Address::repeat_byte(0x22), + input_box_genesis_block: 42, + batch_submitter_address: SENDER_A, + } + } + + #[test] + fn safe_input_api_uses_half_open_intervals() { + let db = temp_db("safe-input-api"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_protocol_timing(); + + assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 0); + let mut out = Vec::new(); + storage + .fill_safe_inputs(SafeInputRange::new(0, 0), &mut out) + .expect("query empty interval"); + assert!(out.is_empty()); + + let inserted = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xa0], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xb1], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, inserted.as_slice(), SENDER_A, &protocol) + .expect("insert safe directs"); + + assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 2); + + storage + .fill_safe_inputs(SafeInputRange::new(0, 2), &mut out) + .expect("query full interval"); + assert_eq!(out, inserted); + + storage + .fill_safe_inputs(SafeInputRange::new(1, 1), &mut out) + .expect("query empty half-open interval"); + assert!(out.is_empty()); + } + + #[test] + fn new_db_has_no_observed_safe_head() { + let db = temp_db("new-db-no-safe-head"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + assert_eq!( + storage.current_safe_block().expect("read safe block"), + None, + "fresh storage should not pretend to have observed L1" + ); + assert_eq!( + storage + .current_safe_block_timestamp() + .expect("read block timestamp"), + None, + "fresh storage should not have a safe block timestamp" + ); + assert_eq!( + storage + .last_safe_progress_ms() + .expect("read sync timestamp"), + None, + "fresh storage should not have a safe-progress timestamp" + ); + } + + #[test] + fn deployment_identity_is_inserted_once() { + let db = temp_db("deployment-identity-insert-once"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let first = identity(); + + assert_eq!( + storage.deployment_identity().expect("read empty identity"), + None + ); + assert_eq!( + storage + .load_or_insert_deployment_identity(first) + .expect("insert identity"), + first + ); + assert_eq!( + storage + .deployment_identity() + .expect("read persisted identity"), + Some(first) + ); + + let changed = DeploymentIdentity { + batch_submitter_address: SENDER_B, + ..first + }; + assert_eq!( + storage + .load_or_insert_deployment_identity(changed) + .expect("load existing identity"), + first, + "identity must be pinned after the first insert" + ); + assert_eq!( + storage + .deployment_identity() + .expect("read persisted identity"), + Some(first) + ); + } + + #[test] + fn append_safe_inputs_creates_and_advances_safe_head() { + let db = temp_db("append-safe-inputs-creates-safe-head"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_protocol_timing(); + + storage + .append_safe_inputs_with_timestamp(7, 1234, &[], SENDER_A, &protocol) + .expect("record first real safe-head observation"); + assert_eq!( + storage.current_safe_block().expect("read safe block"), + Some(7), + "append should create the safe-head row" + ); + let recorded_sync = storage + .last_safe_progress_ms() + .expect("read sync timestamp") + .expect("first observation should record wall-clock time"); + assert_eq!( + storage + .current_safe_block_timestamp() + .expect("read block timestamp"), + Some(1234), + "first observation should record the L1 safe block timestamp" + ); + + storage + .append_safe_inputs_with_timestamp(9, 5678, &[], SENDER_A, &protocol) + .expect("advance safe head"); + assert_eq!( + storage.current_safe_block().expect("read safe block"), + Some(9), + "append should advance the safe-head row" + ); + assert_eq!( + storage + .current_safe_block_timestamp() + .expect("read advanced block timestamp"), + Some(5678), + "append should record the observed L1 block timestamp" + ); + assert!( + storage + .last_safe_progress_ms() + .expect("read sync timestamp") + .expect("advanced observation should record wall-clock time") + >= recorded_sync, + "safe-progress timestamp should stay monotonic across appends" + ); + } +} diff --git a/sequencer/src/storage/l1_submission.rs b/sequencer/src/storage/l1_submission.rs new file mode 100644 index 0000000..17b7404 --- /dev/null +++ b/sequencer/src/storage/l1_submission.rs @@ -0,0 +1,985 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch-aggregate reads: frontier lookup, per-batch frames + user ops, the +//! catch-up / per-batch replay reader, and the SSZ-encoded pending-batch list +//! the submitter pulls each tick. +//! +//! Despite the historical name, nothing in this file does writes — structural +//! nonces are assigned by the `batches.nonce` trigger at close time (see +//! `ingress`), and `safe_accepted_batches` is maintained by `append_safe_inputs` +//! (see `l1_inputs`). The reads here are shared between the batch submitter +//! (hot-path tick) and the egress replay path (catch-up reader); they live +//! together because they all aggregate at the batch level. + +use rusqlite::{Result, params}; + +use super::Storage; +use super::convert::{i64_to_u16, i64_to_u32, i64_to_u64, u64_to_i64}; +use super::queries::{current_safe_block_required, decode_l2_tx_row}; +use super::safe_accepted_batches::frontier_nonce; +use super::{FrameHeader, PendingBatch, SubmitterFrontier}; +use sequencer_core::batch::{Batch, Frame as BatchFrame, WireUserOp}; +use sequencer_core::l2_tx::SequencedL2Tx; + +impl Storage { + /// Read-only frontier view used by the submitter each tick to derive the + /// next batch nonce. `accepted_next_nonce` is the next nonce the scheduler + /// is expected to accept, derived from `safe_accepted_batches`. + /// + /// The scheduler-accepted frontier is maintained by + /// [`Storage::append_safe_inputs`], so this is a pure read. + /// + /// **Precondition:** at least one safe-head observation must have been + /// recorded (via [`Storage::append_safe_inputs`]). In production this is + /// always true because `run_preemptive_recovery` either syncs L1 first + /// or refuses to boot via `L1ViewStale`. Tests must seed an observation + /// explicitly; calling against a fresh DB returns `QueryReturnedNoRows`. + pub fn submitter_frontier(&mut self) -> Result { + self.read(|tx| { + Ok(SubmitterFrontier { + safe_block: current_safe_block_required(tx)?, + accepted_next_nonce: frontier_nonce(tx)?, + }) + }) + } + + /// Highest valid (non-invalidated) `batch_index`, or `None` if no valid + /// batches exist. The open batch is included. + pub fn latest_batch_index(&mut self) -> Result> { + let value: Option = + self.conn + .query_row("SELECT MAX(batch_index) FROM valid_batches", [], |row| { + row.get(0) + })?; + Ok(value.map(i64_to_u64)) + } + + /// Frame headers for `batch_index` in `frame_in_batch` order. Reads the + /// raw `frames` table — does NOT filter on validity, since callers only + /// reach this method after they already know the batch is valid. + pub fn frames_for_batch(&mut self, batch_index: u64) -> Result> { + let mut stmt = self.conn.prepare_cached( + "SELECT frame_in_batch, fee, safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch ASC", + )?; + let rows = stmt.query_map(params![u64_to_i64(batch_index)], |row| { + Ok(FrameHeader { + frame_in_batch: i64_to_u32(row.get(0)?), + fee: i64_to_u16(row.get(1)?), + safe_block: i64_to_u64(row.get(2)?), + }) + })?; + rows.collect::>>() + } + + /// Materialize all sequenced L2 txs in one batch (used by the catch-up / + /// per-batch replay paths). Returns `[]` for invalidated batches. + pub fn ordered_l2_txs_for_batch(&mut self, batch_index: u64) -> Result> { + const SQL: &str = " + SELECT + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, + CASE + WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender + WHEN s.safe_input_index IS NOT NULL THEN d.sender + ELSE NULL + END AS sender, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number + FROM valid_sequenced_l2_txs s + LEFT JOIN user_ops u + ON u.batch_index = s.batch_index + AND u.frame_in_batch = s.frame_in_batch + AND u.pos_in_frame = s.user_op_pos_in_frame + LEFT JOIN frames f + ON f.batch_index = s.batch_index + AND f.frame_in_batch = s.frame_in_batch + LEFT JOIN safe_inputs d + ON d.safe_input_index = s.safe_input_index + WHERE s.batch_index = ?1 + ORDER BY s.offset ASC + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(batch_index)], |row| { + Ok(decode_l2_tx_row( + row.get(0)?, + row.get(1)?, + row.get(2)?, + row.get(3)?, + row.get(4)?, + row.get(5)?, + )) + })?; + rows.collect::>>() + } + + /// Load all valid closed batches with nonce >= `min_nonce`, in nonce order, + /// each one fully assembled and SSZ-encoded with its authoritative nonce. + /// + /// Authoritative because the nonce stamped into the wire payload is the + /// one the DB persists on the batch row (via the `parent.nonce + 1` + /// structural invariant). The caller never sees an unstamped batch — + /// there is no way to accidentally encode with the wrong nonce. + pub fn pending_batches(&mut self, min_nonce: u64) -> Result> { + const SQL: &str = "SELECT batch_index, nonce FROM valid_closed_batches \ + WHERE nonce >= ?1 ORDER BY nonce ASC"; + let pending_refs: Vec<(u64, u64)> = { + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(min_nonce)], |row| { + let bi: i64 = row.get(0)?; + let nonce: i64 = row.get(1)?; + Ok((i64_to_u64(bi), i64_to_u64(nonce))) + })?; + rows.collect::>>()? + }; + + let mut batches = Vec::with_capacity(pending_refs.len()); + for (batch_index, nonce) in pending_refs { + let frames = self.load_batch_frames(batch_index)?; + let batch = Batch { nonce, frames }; + let encoded = ssz::Encode::as_ssz_bytes(&batch); + batches.push(PendingBatch { + batch_index, + nonce, + encoded, + }); + } + Ok(batches) + } + + /// Load every frame (header + user ops) of `batch_index` in frame order. + /// Internal helper for [`Self::pending_batches`]; does NOT filter on + /// validity — callers only reach this after they know the batch is valid. + fn load_batch_frames(&mut self, batch_index: u64) -> Result> { + let frame_headers = self.frames_for_batch(batch_index)?; + let mut frames = Vec::with_capacity(frame_headers.len()); + for header in frame_headers { + let mut stmt = self.conn.prepare_cached( + "SELECT nonce, max_fee, data, sig FROM user_ops \ + WHERE batch_index = ?1 AND frame_in_batch = ?2 \ + ORDER BY pos_in_frame ASC", + )?; + let rows = stmt.query_map( + params![u64_to_i64(batch_index), i64::from(header.frame_in_batch)], + |row| { + Ok(WireUserOp { + nonce: i64_to_u32(row.get(0)?), + max_fee: i64_to_u16(row.get(1)?), + data: row.get(2)?, + signature: row.get(3)?, + }) + }, + )?; + let user_ops: Vec = rows.collect::>()?; + frames.push(BatchFrame { + user_ops, + safe_block: header.safe_block, + fee_price: header.fee, + }); + } + Ok(frames) + } +} + +#[cfg(test)] +mod tests { + use super::super::test_helpers::{ + SENDER_A, SENDER_B, seed_closed_batches, seed_safe_inputs_with_batch_nonces, temp_db, + }; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; + use alloy_primitives::Address; + use sequencer_core::batch::{Batch, Frame as BatchFrame}; + use sequencer_core::protocol::ProtocolTiming; + + #[test] + fn pending_batches_stamps_authoritative_nonce_into_wire_bytes() { + // The landmine we removed: an earlier `batch_for_submission` returned a + // `Batch { nonce: 0, … }` placeholder, and callers had to remember to + // stamp the real nonce via `encode_for_scheduler_with_nonce`. The new + // `pending_batches` reads the DB-authoritative nonce from + // `valid_closed_batches` and bakes it straight into the SSZ bytes — so + // decoding the payload must round-trip back to that nonce, and the + // frame body must match what storage persisted. + let db = temp_db("pending-batches-nonce-baked-in"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(12, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + // Close batch 0 so it becomes eligible for submission. + storage + .close_frame_and_batch(&mut head, 12) + .expect("close batch 0"); + + let pending = storage.pending_batches(0).expect("load pending batches"); + assert_eq!(pending.len(), 1); + let entry = &pending[0]; + assert_eq!(entry.batch_index, 0); + assert_eq!(entry.nonce, 0, "genesis batch has nonce 0"); + + // The wire bytes must decode back to the authoritative nonce AND the + // frame body storage persisted. + let decoded: Batch = + ssz::Decode::from_ssz_bytes(&entry.encoded).expect("decode pending wire bytes"); + assert_eq!(decoded.nonce, entry.nonce); + assert_eq!(decoded.frames.len(), 1); + let frame = &decoded.frames[0]; + assert!(frame.user_ops.is_empty()); + assert_eq!(frame.safe_block, 12); + // Default log_recommended_fee = 0+20+419+621 = 1060. + assert_eq!(frame.fee_price, 1060); + } + + #[test] + fn batch_level_helpers_expose_latest_index_frames_and_txs() { + let db = temp_db("batch-level-helpers"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Before initialization there should be no batches. + assert!( + storage + .latest_batch_index() + .expect("query latest batch nonce on empty db") + .is_none() + ); + + // Initialize first batch/frame and append some data. + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + // Close current batch and move to next so batch 0 becomes closed. + let next_safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe_block) + .expect("close batch and rotate"); + + // Latest batch nonce should now be 1 (open), with batch 0 closed. + let latest = storage + .latest_batch_index() + .expect("query latest batch nonce") + .expect("latest batch should exist"); + assert_eq!(latest, 1); + + // Batch 0 should still have at least one frame header. + let frames = storage + .frames_for_batch(0) + .expect("load frames for batch 0"); + assert!(!frames.is_empty()); + + // Ordered L2 txs for batch 0 should be queryable (even if empty). + let txs = storage + .ordered_l2_txs_for_batch(0) + .expect("load l2 txs for batch 0"); + assert!( + txs.is_empty(), + "fresh batch should not have sequenced txs yet" + ); + } + + #[test] + fn closed_batch_becomes_eligible_for_submission_with_assigned_nonce() { + // : closing a batch transitions it from "open Tip" to "eligible + // for L1 submission" — it appears in `valid_closed_batches` with a + // nonce derived from its parent pointer. Pins the submitter's + // contract: open batches are NOT pulled into the submission pipeline, + // and closed batches ARE, at the schema-guaranteed nonce. + let db = temp_db("closed-batch-eligible"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + // Before close: the open batch must not appear in pending-batches. + let pending_before = storage + .pending_batches(0) + .expect("load pending batches (pre-close)"); + assert!( + pending_before.is_empty(), + "open batch must not be eligible for submission: {pending_before:?}", + ); + + // Close batch 0 — this rotates the Tip to batch 1 and seals batch 0. + let safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, safe_block) + .expect("close batch 0"); + + // After close: batch 0 is eligible with nonce 0 (genesis, parent + // NULL → trigger assigns nonce 0). + let pending_after = storage + .pending_batches(0) + .expect("load pending batches (post-close)"); + assert_eq!( + pending_after.len(), + 1, + "exactly one batch should be eligible after the first close", + ); + assert_eq!(pending_after[0].batch_index, 0); + assert_eq!( + pending_after[0].nonce, 0, + "closed batch 0 must carry nonce 0 (genesis, no parent)", + ); + // The new open Tip (batch 1) must NOT be eligible even though it + // exists — eligibility requires sealed_at_ms NOT NULL. + assert!( + pending_after.iter().all(|b| b.batch_index != 1), + "open batch 1 (the new Tip) must not be eligible: {pending_after:?}", + ); + } + + #[test] + fn submitter_frontier_returns_zero_when_no_batches_were_accepted() { + let db = temp_db("submitter-frontier-empty"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .append_safe_inputs(0, &[], SENDER_A, &default_test_protocol()) + .expect("record observed safe head"); + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 0); + assert_eq!(frontier.accepted_next_nonce, 0); + } + + #[test] + fn submitter_frontier_tracks_accepted_prefix() { + let db = temp_db("submitter-frontier-prefix"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + // seed_safe_inputs_with_batch_nonces already calls append_safe_inputs, + // which auto-populates safe_accepted_batches. + seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 10); + assert_eq!(frontier.accepted_next_nonce, 2); + } + + fn default_test_protocol() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: 1200, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } + } + + fn unix_now_ms() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 + } + + #[test] + fn check_danger_reports_observed_closed_batch_danger() { + let db = temp_db("check-danger-observed-closed"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 1"); + + let protocol = default_test_protocol(); + storage + .append_safe_inputs( + 1135, + &[StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&Batch { + nonce: 0, + frames: vec![BatchFrame { + user_ops: vec![], + safe_block: 10, + fee_price: 0, + }], + }), + block_number: 20, + }], + SENDER_A, + &protocol, + ) + .expect("append accepted batch 0"); + + let status = storage + .check_danger(&protocol, unix_now_ms()) + .expect("check_danger"); + assert_eq!(status, crate::storage::DangerStatus::ClosedBatchInDanger(1)); + } + + #[test] + fn check_danger_reports_estimated_batch_danger_on_wall_clock_drift() { + // Observed block-based checks wouldn't fire (batch 1 has + // first_frame_safe_block = 100 and safe_block = 1200, age = 1100 < + // 1125). But wall-clock says the safe head hasn't advanced in ~25 + // blocks: the effective threshold drops to 1100, so batch 1 reaches + // danger via the wall-clock correction. + let db = temp_db("check-danger-estimated"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + let protocol = default_test_protocol(); + storage + .append_safe_inputs( + 1200, + &[StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&Batch { + nonce: 0, + frames: vec![BatchFrame { + user_ops: vec![], + safe_block: 100, + fee_price: 0, + }], + }), + block_number: 200, + }], + SENDER_A, + &protocol, + ) + .expect("append accepted batch 0"); + + // Pretend safe-progress was recorded 25 blocks' worth of wall-clock ago. + let now_ms = unix_now_ms(); + storage + .conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + + let status = storage + .check_danger(&protocol, now_ms) + .expect("check_danger"); + assert_eq!( + status, + crate::storage::DangerStatus::EstimatedBatchInDanger(1) + ); + } + + #[test] + fn check_danger_prefers_observed_closed_over_estimated_when_both_fire() { + // If the observed safe head already puts the closed frontier past + // `danger_threshold`, route to observed closed-batch recovery. The + // wall-clock arm is a fallback, not a mask over observed danger. + let db = temp_db("check-danger-observed-closed-wins"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let protocol = default_test_protocol(); + // Advance safe head so batch 0 is past `danger_threshold` (1125): + // current=1200 - first_frame=10 = 1190 >= 1125. + // No safe_input ingested for batch 0, so it stays non-gold. + storage + .append_safe_inputs(1200, &[], SENDER_A, &protocol) + .expect("advance safe head past observed danger threshold"); + + // Pretend safe-progress was recorded 25 blocks' worth of wall-clock + // ago. With the wall-clock correction, the adjusted threshold is + // 1100 — batch 0's age (1190) also clears that, so wall-clock fires. + let now_ms = unix_now_ms(); + storage + .conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + + let status = storage + .check_danger(&protocol, now_ms) + .expect("check_danger"); + assert_eq!( + status, + crate::storage::DangerStatus::ClosedBatchInDanger(0), + "observed closed-batch danger must win once safe-state has crossed danger", + ); + } + + #[test] + fn check_danger_prefers_observed_tip_over_estimated_when_both_fire() { + // Same fallback rule for the open Tip: if observed safe-state already + // puts the Tip in danger, recover the Tip directly instead of refusing + // under the wall-clock classification. + let db = temp_db("check-danger-observed-tip-wins"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0; batch 1 is Tip"); + + let protocol = default_test_protocol(); + storage + .append_safe_inputs( + 1200, + &[StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&Batch { + nonce: 0, + frames: vec![BatchFrame { + user_ops: vec![], + safe_block: 10, + fee_price: 0, + }], + }), + block_number: 20, + }], + SENDER_A, + &protocol, + ) + .expect("append accepted batch 0"); + + // The Tip's observed age is 1190 >= 1125, and the wall-clock-adjusted + // arm would also fire. Observed Tip danger must win. + let now_ms = unix_now_ms(); + storage + .conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + + let status = storage + .check_danger(&protocol, now_ms) + .expect("check_danger"); + assert_eq!( + status, + crate::storage::DangerStatus::TipInDanger(1), + "Tip must win when observed safe-state has already crossed danger", + ); + } + + #[test] + fn check_danger_refuses_when_l1_view_is_stale() { + let db = temp_db("check-danger-l1-view-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let protocol = default_test_protocol(); + let old_safe_timestamp = 1_000_u64; + storage + .append_safe_inputs_with_timestamp(1200, old_safe_timestamp, &[], SENDER_A, &protocol) + .expect("advance safe head with stale L1 timestamp"); + + let now_ms = + (old_safe_timestamp + protocol.l1_read_stale_after_secs()).saturating_mul(1000); + let status = storage + .check_danger(&protocol, now_ms) + .expect("check_danger"); + assert_eq!( + status, + crate::storage::DangerStatus::L1ViewStale, + "global L1 view staleness must refuse before observed batch recovery", + ); + } + + #[test] + fn check_danger_safe_when_never_synced() { + // Fresh DB, no prior safe block timestamp. The L1 view is unusable + // until the input reader records a real safe-head observation. + let db = temp_db("check-danger-never-synced"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let status = storage + .check_danger(&default_test_protocol(), unix_now_ms()) + .expect("check_danger"); + assert_eq!(status, crate::storage::DangerStatus::L1ViewStale); + } + + #[test] + fn populate_safe_accepted_batches_resumes_from_latest_row() { + let db = temp_db("safe-accepted-frontier-resume"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1]); + + // Mixed-sender wave: the SENDER_B row must be ignored, SENDER_A rows + // must resume from the cursor and advance the frontier. + let second_wave = vec![ + StoredSafeInput { + sender: SENDER_B, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 99, + frames: Vec::new(), + }), + block_number: 11, + }, + StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 2, + frames: Vec::new(), + }), + block_number: 11, + }, + StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 3, + frames: Vec::new(), + }), + block_number: 11, + }, + ]; + storage + .append_safe_inputs(11, second_wave.as_slice(), SENDER_A, &protocol) + .expect("append second wave"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 11); + assert_eq!(frontier.accepted_next_nonce, 4); + + let accepted_count: i64 = storage + .conn + .query_row("SELECT COUNT(*) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .expect("count accepted rows"); + assert_eq!(accepted_count, 4); + } + + #[test] + fn safe_accepted_frontier_skips_stale_payloads() { + let db = temp_db("safe-accepted-frontier-skip-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + // Seed a non-stale batch with nonce 0 (safe_block=100, block_number=200, max_wait=1200 → not stale) + let non_stale_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 0, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }], + }); + // Seed a stale batch with nonce 1 (safe_block=100, block_number=2000, max_wait=1200 → stale) + let stale_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }], + }); + // Seed a non-stale batch with nonce 1 (safe_block=1900, block_number=2000 → not stale) + let non_stale_payload_2 = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 1900, + fee_price: 0, + }], + }); + + let inputs = vec![ + StoredSafeInput { + sender: SENDER_A, + payload: non_stale_payload, + block_number: 200, + }, + StoredSafeInput { + sender: SENDER_A, + payload: stale_payload, + block_number: 2000, + }, + StoredSafeInput { + sender: SENDER_A, + payload: non_stale_payload_2, + block_number: 2000, + }, + ]; + storage + .append_safe_inputs(2000, inputs.as_slice(), SENDER_A, &protocol) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.accepted_next_nonce, 2); + } + + #[test] + fn frontier_accepts_future_safe_block_batch_by_design() { + // The scheduler rejects batches where frame safe_block > inclusion_block, + // but the sequencer trusts its own output and does not re-validate these + // invariants during recovery. This test documents the intentional design + // choice: populate_safe_accepted_batches accepts such batches because + // the sequencer would never produce them. + let db = temp_db("frontier-future-safe-block"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let future_safe_block_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 0, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 500, + fee_price: 0, + }], + }); + let non_monotonic_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![ + sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 200, + fee_price: 0, + }, + sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }, + ], + }); + + let batch_submitter = Address::repeat_byte(0xCC); + let protocol = ProtocolTiming { + max_wait_blocks: u64::MAX, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }; + let inputs = vec![ + StoredSafeInput { + sender: batch_submitter, + payload: future_safe_block_payload, + block_number: 100, + }, + StoredSafeInput { + sender: batch_submitter, + payload: non_monotonic_payload, + block_number: 200, + }, + ]; + storage + .append_safe_inputs(200, inputs.as_slice(), batch_submitter, &protocol) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 2, + "both batches should be in accepted frontier" + ); + } + + #[test] + fn pending_batches_skips_invalidated_and_respects_min_nonce() { + let db = temp_db("load-pending-batches-filter"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + seed_closed_batches(&mut storage, 3); + storage.insert_invalid_batch(1).expect("invalidate batch 1"); + + // From nonce 0: batches 0 and 2 remain valid. + let from_zero = storage + .pending_batches(0) + .expect("load pending batches from 0"); + let nonces: Vec = from_zero.iter().map(|b| b.nonce).collect(); + assert_eq!(nonces, vec![0, 2], "batch 1 must be filtered out"); + + // From nonce 1: only batch 2 remains (batch 0 is below min_nonce). + let from_one = storage + .pending_batches(1) + .expect("load pending batches from 1"); + let nonces: Vec = from_one.iter().map(|b| b.nonce).collect(); + assert_eq!(nonces, vec![2]); + + // Past the suffix: empty. + let from_three = storage + .pending_batches(3) + .expect("load pending batches from 3"); + assert!( + from_three.is_empty(), + "no batch should remain at nonce >= 3" + ); + } + + #[test] + fn nonce_is_reused_after_torn_cascade() { + // After a torn cascade invalidates every batch (including genesis), + // the recovery batch has no valid ancestor. Its parent is NULL, + // so its nonce resets to 0 — effectively reusing the nonce of the + // original genesis. The scheduler's "expected next nonce" also + // resets to 0, since no accepted batches were ever submitted. + let db = temp_db("nonce-reuse-after-torn-cascade"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage.insert_invalid_batch(0).expect("invalidate batch 0"); + storage.insert_invalid_batch(1).expect("invalidate batch 1"); + storage + .append_safe_inputs(10, &[], SENDER_A, &default_test_protocol()) + .expect("record observed safe head"); + storage + .recover_post_flush(1200) + .expect("open recovery batch after torn invalidation"); + + let head = storage + .open_state() + .expect("load open state") + .expect("recovery batch"); + assert_eq!(head.batch_index, 2); + + // Recovery Tip has no valid ancestor → parent NULL → nonce 0. + let recovery_nonce: i64 = storage + .conn + .query_row( + "SELECT nonce FROM batches WHERE batch_index = 2", + [], + |row| row.get(0), + ) + .expect("query recovery nonce"); + assert_eq!(recovery_nonce, 0, "recovery Tip reuses nonce 0"); + } + + #[test] + fn populate_safe_accepted_batches_skips_duplicate_nonces() { + let db = temp_db("populate-dup-nonces"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 20, + &[ + StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 20, + }, + StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 20, + }, + ], + SENDER_A, + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 1, + "duplicate nonce must be skipped" + ); + } + + #[test] + fn populate_safe_accepted_batches_handles_large_nonce_gap() { + let db = temp_db("populate-nonce-gap"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(5, 10), + block_number: 20, + }], + SENDER_A, + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.accepted_next_nonce, 0, "gap must stall frontier"); + } + + #[test] + fn populate_safe_accepted_batches_out_of_order_arrivals_stalls_frontier() { + let db = temp_db("populate-out-of-order"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close 2"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(1, 10), + block_number: 20, + }], + SENDER_A, + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 0, + "out of order must stall frontier" + ); + + storage + .append_safe_inputs( + 21, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 21, + }], + SENDER_A, + &protocol, + ) + .expect("append nonce 0"); + + let frontier2 = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier2.accepted_next_nonce, 1, + "frontier must remain stalled" + ); + } +} diff --git a/sequencer/src/storage/migrations/0001_schema.sql b/sequencer/src/storage/migrations/0001_schema.sql index 4dde509..4adf171 100644 --- a/sequencer/src/storage/migrations/0001_schema.sql +++ b/sequencer/src/storage/migrations/0001_schema.sql @@ -1,11 +1,133 @@ +-- --------------------------------------------------------------------------- +-- Batch lifecycle +-- +-- A batch has two monotonic events in its lifetime, each stored as a nullable +-- write-once timestamp on the row: +-- +-- * `sealed_at_ms` — inclusion lane closed the batch (no more ops). +-- * `invalidated_at_ms` — recovery cascade-invalidated the batch. +-- +-- NULL means the event hasn't happened. Once set, triggers below make the +-- column write-once. The only "mutable" state on the row is these two NULL→value +-- transitions, each owned by exactly one writer (inclusion lane vs recovery). +-- +-- The **Tip** is the one batch currently accepting ops: sealed_at_ms IS NULL +-- AND invalidated_at_ms IS NULL. A partial unique index enforces at-most-one. +-- +-- `nonce` is structural: equal to `parent.nonce + 1`, or 0 for genesis (parent +-- NULL). Enforced by trigger on INSERT. The scheduler's view of a batch's +-- identity; reused across recovery cascades (new Tip forks from last valid +-- ancestor, inheriting nonce via the +1 rule). +-- --------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS batches ( - batch_index INTEGER PRIMARY KEY, - created_at_ms INTEGER NOT NULL + batch_index INTEGER PRIMARY KEY, + parent_batch_index INTEGER REFERENCES batches(batch_index), -- NULL only for genesis + nonce INTEGER NOT NULL CHECK (nonce >= 0), + created_at_ms INTEGER NOT NULL, + sealed_at_ms INTEGER + CHECK (sealed_at_ms IS NULL OR sealed_at_ms >= created_at_ms), + invalidated_at_ms INTEGER + CHECK (invalidated_at_ms IS NULL OR invalidated_at_ms >= created_at_ms) ); +-- "At most one valid Tip" — structural via partial unique index. The predicate +-- references only local columns of `batches`, so SQLite accepts it. +-- +-- We index on COALESCE(sealed_at_ms, 0) instead of sealed_at_ms directly +-- because SQLite UNIQUE indexes treat NULLs as distinct — so indexing directly +-- on `sealed_at_ms` would allow many NULL rows. COALESCE maps all matching +-- rows to the same non-NULL value (0), forcing real uniqueness. +CREATE UNIQUE INDEX IF NOT EXISTS ux_single_valid_tip + ON batches(COALESCE(sealed_at_ms, 0)) + WHERE sealed_at_ms IS NULL AND invalidated_at_ms IS NULL; + +-- Submitter hot path: "give me valid closed batches with nonce >= N", ordered. +CREATE INDEX IF NOT EXISTS idx_batches_valid_closed_by_nonce + ON batches(nonce) + WHERE invalidated_at_ms IS NULL AND sealed_at_ms IS NOT NULL; + +-- ── Views ────────────────────────────────────────────────────────────────── +CREATE VIEW IF NOT EXISTS valid_batches AS + SELECT * FROM batches WHERE invalidated_at_ms IS NULL; + +CREATE VIEW IF NOT EXISTS valid_closed_batches AS + SELECT * FROM valid_batches WHERE sealed_at_ms IS NOT NULL; + +-- At most one row by the partial unique index above. +CREATE VIEW IF NOT EXISTS valid_open_batch AS + SELECT * FROM valid_batches WHERE sealed_at_ms IS NULL; + +-- ── Triggers ─────────────────────────────────────────────────────────────── +-- +-- These enforce invariants the writer could otherwise violate with a bug. +-- Keep them declarative: each one names an invariant and refuses writes that +-- would break it. The Rust writer is still the source of truth for the +-- transition sequence — triggers just ensure the DB never reaches an +-- inconsistent state if the writer misbehaves. + +-- Nonce contiguity: `nonce = parent.nonce + 1`, or 0 for genesis. +CREATE TRIGGER IF NOT EXISTS trg_enforce_nonce_contiguity +AFTER INSERT ON batches +FOR EACH ROW +BEGIN + SELECT CASE + WHEN NEW.parent_batch_index IS NULL AND NEW.nonce != 0 + THEN RAISE(ABORT, 'genesis batch must have nonce 0') + WHEN NEW.parent_batch_index IS NOT NULL + AND NEW.nonce != (SELECT nonce + 1 FROM batches WHERE batch_index = NEW.parent_batch_index) + THEN RAISE(ABORT, 'batch nonce must equal parent.nonce + 1') + END; +END; + +-- Write-once: sealed_at_ms transitions only NULL → non-NULL. +CREATE TRIGGER IF NOT EXISTS trg_sealed_at_ms_write_once +BEFORE UPDATE OF sealed_at_ms ON batches +FOR EACH ROW +WHEN OLD.sealed_at_ms IS NOT NULL +BEGIN + SELECT RAISE(ABORT, 'sealed_at_ms is write-once'); +END; + +-- Write-once: invalidated_at_ms transitions only NULL → non-NULL. +CREATE TRIGGER IF NOT EXISTS trg_invalidated_at_ms_write_once +BEFORE UPDATE OF invalidated_at_ms ON batches +FOR EACH ROW +WHEN OLD.invalidated_at_ms IS NOT NULL +BEGIN + SELECT RAISE(ABORT, 'invalidated_at_ms is write-once'); +END; + +-- parent_batch_index is immutable after insert. +CREATE TRIGGER IF NOT EXISTS trg_parent_batch_index_immutable +BEFORE UPDATE OF parent_batch_index ON batches +FOR EACH ROW +WHEN (OLD.parent_batch_index IS NULL) != (NEW.parent_batch_index IS NULL) + OR OLD.parent_batch_index IS NOT NULL AND NEW.parent_batch_index IS NOT NULL + AND OLD.parent_batch_index != NEW.parent_batch_index +BEGIN + SELECT RAISE(ABORT, 'parent_batch_index is immutable'); +END; + +-- nonce is immutable after insert. +CREATE TRIGGER IF NOT EXISTS trg_nonce_immutable +BEFORE UPDATE OF nonce ON batches +FOR EACH ROW +WHEN OLD.nonce != NEW.nonce +BEGIN + SELECT RAISE(ABORT, 'nonce is immutable'); +END; + +-- --------------------------------------------------------------------------- +-- Frames and user ops: must target the current Tip. +-- +-- These catch "stale WriteHead" bugs — where a writer holds an in-memory +-- batch_index that's no longer the Tip (sealed or invalidated between reads). +-- A PK lookup per row: microseconds, negligible overhead even on hot paths. +-- --------------------------------------------------------------------------- + CREATE TABLE IF NOT EXISTS frames ( batch_index INTEGER NOT NULL REFERENCES batches(batch_index), - frame_in_batch INTEGER NOT NULL, + frame_in_batch INTEGER NOT NULL CHECK (frame_in_batch >= 0), created_at_ms INTEGER NOT NULL, -- Fee committed by the sequencer for this whole frame. fee INTEGER NOT NULL CHECK (fee >= 0), @@ -14,21 +136,46 @@ CREATE TABLE IF NOT EXISTS frames ( PRIMARY KEY(batch_index, frame_in_batch) ); +CREATE TRIGGER IF NOT EXISTS trg_frames_target_must_be_tip +BEFORE INSERT ON frames +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'frames can only be inserted into the current Tip'); +END; + CREATE TABLE IF NOT EXISTS user_ops ( batch_index INTEGER NOT NULL, frame_in_batch INTEGER NOT NULL, - pos_in_frame INTEGER NOT NULL, - sender BLOB NOT NULL, - nonce INTEGER NOT NULL, - max_fee INTEGER NOT NULL, + pos_in_frame INTEGER NOT NULL CHECK (pos_in_frame >= 0), + sender BLOB NOT NULL CHECK (length(sender) = 20), + nonce INTEGER NOT NULL CHECK (nonce >= 0), + max_fee INTEGER NOT NULL CHECK (max_fee >= 0), data BLOB NOT NULL, - sig BLOB NOT NULL, + sig BLOB NOT NULL CHECK (length(sig) = 65), received_at_ms INTEGER NOT NULL, PRIMARY KEY(batch_index, frame_in_batch, pos_in_frame), - FOREIGN KEY(batch_index, frame_in_batch) REFERENCES frames(batch_index, frame_in_batch), - UNIQUE(sender, nonce) + FOREIGN KEY(batch_index, frame_in_batch) REFERENCES frames(batch_index, frame_in_batch) ); +CREATE TRIGGER IF NOT EXISTS trg_user_ops_target_must_be_tip +BEFORE INSERT ON user_ops +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'user_ops can only be inserted into the current Tip'); +END; + -- Automatically sequence every user-op into the global replay order on insert. -- Note: safe_inputs do NOT have an analogous trigger because their -- batch_index/frame_in_batch are not known at INSERT time — safe inputs @@ -50,6 +197,9 @@ CREATE TABLE IF NOT EXISTS safe_inputs ( block_number INTEGER NOT NULL CHECK (block_number >= 0) ); +CREATE INDEX IF NOT EXISTS idx_safe_inputs_sender + ON safe_inputs(sender); + -- Global append-only replay order consumed by catch-up and feed readers. -- It is a cache, containing the merged and flattened txs of safe_inputs and user_ops. CREATE TABLE IF NOT EXISTS sequenced_l2_txs ( @@ -77,22 +227,74 @@ CREATE TABLE IF NOT EXISTS sequenced_l2_txs ( ), -- At most one sequenced user-op row for each user-op key. - UNIQUE(batch_index, frame_in_batch, user_op_pos_in_frame), - -- A direct input can only be sequenced once. - UNIQUE(safe_input_index) + UNIQUE(batch_index, frame_in_batch, user_op_pos_in_frame) + -- A direct input may be sequenced more than once if its original batch is + -- invalidated and a recovery batch re-drains it. The read-side query filters + -- out rows from invalid batches, so only the latest valid drain is visible. + -- (No UNIQUE constraint on safe_input_index.) ); +CREATE TRIGGER IF NOT EXISTS trg_sequenced_l2_txs_target_must_be_tip +BEFORE INSERT ON sequenced_l2_txs +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'sequenced_l2_txs can only target the current Tip'); +END; + CREATE INDEX IF NOT EXISTS idx_sequenced_l2_txs_frame ON sequenced_l2_txs(batch_index, frame_in_batch); +-- Partial index for efficient MAX(safe_input_index) lookups used to compute +-- the next undrained direct-input cursor at frame-close time. +CREATE INDEX IF NOT EXISTS idx_sequenced_l2_txs_safe_input + ON sequenced_l2_txs(safe_input_index) WHERE safe_input_index IS NOT NULL; + +CREATE VIEW IF NOT EXISTS valid_sequenced_l2_txs AS +SELECT * FROM sequenced_l2_txs +WHERE batch_index NOT IN (SELECT batch_index FROM batches WHERE invalidated_at_ms IS NOT NULL); + +-- Derived log of batch submissions the scheduler would actually execute. +-- Unlike a raw log of all safe submissions, this only contains the accepted +-- prefix: batches whose nonce matched the expected sequence and were not stale. +-- Maintained atomically by Storage::append_safe_inputs (via +-- populate_safe_accepted_batches_inner), which simulates the scheduler's +-- acceptance logic over new safe_inputs rows. +CREATE TABLE IF NOT EXISTS safe_accepted_batches ( + safe_input_index INTEGER PRIMARY KEY REFERENCES safe_inputs(safe_input_index), + nonce INTEGER NOT NULL, + first_frame_safe_block INTEGER NOT NULL, + inclusion_block INTEGER NOT NULL +); + CREATE TABLE IF NOT EXISTS l1_safe_head ( singleton_id INTEGER PRIMARY KEY CHECK (singleton_id = 0), -- Highest L1 safe block the input reader has observed and atomically synced into storage. - block_number INTEGER NOT NULL CHECK (block_number >= 0) + block_number INTEGER NOT NULL CHECK (block_number >= 0), + -- L1 timestamp (Unix seconds) of block_number. + block_timestamp INTEGER NOT NULL CHECK (block_timestamp >= 0), + -- Wall-clock time (Unix ms) of the last successful L1 sync. + -- Used for wall-clock danger estimation when L1 is unreachable. + synced_at_ms INTEGER NOT NULL CHECK (synced_at_ms >= 0) +); + +-- Deployment identity: the persisted DB is only valid for this deployment. +-- Allows L1-unreachable startup after first boot, and prevents interpreting +-- historical sequencer state under a different app or batch-submitter address. +CREATE TABLE IF NOT EXISTS deployment_identity ( + singleton_id INTEGER PRIMARY KEY CHECK (singleton_id = 0), + chain_id INTEGER NOT NULL CHECK (chain_id > 0), + app_address BLOB NOT NULL CHECK (length(app_address) = 20), + input_box_address BLOB NOT NULL CHECK (length(input_box_address) = 20), + input_box_genesis_block INTEGER NOT NULL CHECK (input_box_genesis_block >= 0), + batch_submitter_address BLOB NOT NULL CHECK (length(batch_submitter_address) = 20) ); -INSERT OR IGNORE INTO l1_safe_head (singleton_id, block_number) -VALUES (0, 0); -- --------------------------------------------------------------------------- -- Batch policy singleton diff --git a/sequencer/src/storage/mod.rs b/sequencer/src/storage/mod.rs index c3fb30f..0c966a3 100644 --- a/sequencer/src/storage/mod.rs +++ b/sequencer/src/storage/mod.rs @@ -1,14 +1,52 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) -mod db; -mod sql; +//! SQLite-backed storage for the sequencer. +//! +//! [`Storage`] is the single entry point. Methods are clustered by caller role +//! across sibling files — mostly "one file per writer", plus one read-only +//! batch-aggregate file that two roles share: +//! +//! - `ingress` — inclusion lane: user-op append, frame/batch close +//! - `egress` — WS feed and catch-up replay (read-only) +//! - `l1_inputs` — input reader: safe-input ingestion, L1 head, deployment identity +//! - `l1_submission` — batch-aggregate reads (submitter frontier, pending +//! batches, per-batch replay) shared between the submitter and egress +//! - `recovery` — cascade invalidation, recovery-batch open, danger checks +//! - `admin` — operator policy tunables (gas price, alpha) +//! +//! Cross-writer helpers are split by concern: +//! +//! - `convert` — int width + time conversions +//! - `queries` — shared read helpers (`query_*`, `load_current_write_head`) +//! - `mutations` — shared write helpers (`insert_new_batch`, `seal_batch`, …) +//! +//! The schema and `valid_*` views live in `migrations/0001_schema.sql`. See +//! `docs/recovery/README.md` for the recovery design and TLA+ specs. + +mod admin; +mod convert; +mod egress; +mod ingress; +mod l1_inputs; +mod l1_submission; +mod mutations; +mod open; +mod queries; +mod recovery; +mod safe_accepted_batches; + +#[cfg(test)] +pub(crate) mod test_helpers; use std::time::SystemTime; use thiserror::Error; -pub use db::Storage; +pub use open::Storage; +pub use recovery::DangerStatus; +/// One safe input as stored on the L1 InputBox: sender, opaque payload, and +/// the L1 block where it was included. #[derive(Debug, Clone, PartialEq, Eq)] pub struct StoredSafeInput { pub sender: alloy_primitives::Address, @@ -17,10 +55,16 @@ pub struct StoredSafeInput { pub block_number: u64, } +/// Half-open range `[start, end)` over `safe_input_index` values. Used to +/// describe which safe inputs a frame drained. +/// +/// Fields are private so the `new`-time invariant (`end >= start`) can't be +/// broken by direct mutation. Read via [`start`](Self::start) / +/// [`end`](Self::end); construct via [`new`](Self::new) / [`empty_at`](Self::empty_at). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct SafeInputRange { - pub start_inclusive: u64, - pub end_exclusive: u64, + start_inclusive: u64, + end_exclusive: u64, } impl SafeInputRange { @@ -39,21 +83,76 @@ impl SafeInputRange { Self::new(index, index) } + /// Extend the range forward, producing `[self.end, new_end)`. Panics if + /// `new_end < self.end` — this is the "advance" direction only. pub fn advance_to(self, end_exclusive: u64) -> Self { Self::new(self.end_exclusive, end_exclusive) } + pub fn start(self) -> u64 { + self.start_inclusive + } + + pub fn end(self) -> u64 { + self.end_exclusive + } + pub fn is_empty(self) -> bool { self.start_inclusive == self.end_exclusive } + + /// Split the range into consecutive sub-ranges of at most `max_len` + /// elements. The last chunk may be shorter. Yields nothing if empty. + pub fn chunks(self, max_len: u64) -> SafeInputRangeChunks { + assert!(max_len > 0, "chunk size must be positive"); + SafeInputRangeChunks { + cursor: self.start_inclusive, + end: self.end_exclusive, + max_len, + } + } } +/// Iterator returned by [`SafeInputRange::chunks`]. +pub struct SafeInputRangeChunks { + cursor: u64, + end: u64, + max_len: u64, +} + +impl Iterator for SafeInputRangeChunks { + type Item = SafeInputRange; + + fn next(&mut self) -> Option { + if self.cursor >= self.end { + return None; + } + let chunk_end = self.end.min(self.cursor.saturating_add(self.max_len)); + let chunk = SafeInputRange::new(self.cursor, chunk_end); + self.cursor = chunk_end; + Some(chunk) + } +} + +/// Snapshot of the L1 view: current safe block plus the exclusive cursor into +/// `safe_inputs`. Read by the inclusion lane to decide when to advance. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct SafeFrontier { +pub struct SafeInputFrontier { pub safe_block: u64, pub end_exclusive: u64, } +/// Snapshot of the scheduler-accepted frontier: current safe block plus the +/// next nonce the scheduler is expected to accept. Read by the batch submitter +/// each tick to derive the next unresolved nonce. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SubmitterFrontier { + pub safe_block: u64, + pub accepted_next_nonce: u64, +} + +/// Per-frame metadata: position within batch, committed fee, and the +/// safe-block boundary the frame draws against. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct FrameHeader { pub frame_in_batch: u32, @@ -62,6 +161,31 @@ pub struct FrameHeader { pub safe_block: u64, } +/// A batch ready for L1 submission: its local index, assigned nonce, and SSZ-encoded payload. +#[derive(Debug)] +pub struct PendingBatch { + pub batch_index: u64, + pub nonce: u64, + pub encoded: Vec, +} + +/// Deployment identity the database is bound to. +/// +/// The persisted sequencer state is not portable across these fields: +/// historical safe inputs are classified using the batch-submitter address, +/// user-op signatures use the app/domain identity, and recovery/frontier logic +/// depends on the InputBox stream that began at `input_box_genesis_block`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DeploymentIdentity { + pub chain_id: u64, + pub app_address: alloy_primitives::Address, + pub input_box_address: alloy_primitives::Address, + pub input_box_genesis_block: u64, + pub batch_submitter_address: alloy_primitives::Address, +} + +/// Returned by [`Storage::open`] and friends; either the SQLite handle failed +/// to open or migrations refused to apply. #[derive(Debug, Error)] pub enum StorageOpenError { #[error(transparent)] @@ -80,6 +204,9 @@ pub struct BatchPolicy { pub batch_size_target: u16, } +/// In-memory mirror of the latest open batch + frame. Mutated by `Storage` +/// methods that change the open state (`append_user_ops_chunk`, `close_*`). +/// The lane keeps one `WriteHead` and threads it through every call. #[derive(Debug, Clone, Copy)] pub struct WriteHead { pub batch_index: u64, diff --git a/sequencer/src/storage/mutations.rs b/sequencer/src/storage/mutations.rs new file mode 100644 index 0000000..39203f1 --- /dev/null +++ b/sequencer/src/storage/mutations.rs @@ -0,0 +1,134 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Write-side helpers shared across writer-role files. +//! +//! Like [`super::queries`] these take `&Transaction` so they compose inside a +//! larger atomic unit. The two consumers today are ingress (batch/frame close +//! + re-drain) and recovery (opening a recovery batch after cascade). + +use rusqlite::{Result, Transaction, params}; + +use super::SafeInputRange; +use super::convert::{i64_to_u64, u64_to_i64}; + +/// Insert a new batch. Nonce is derived from `parent_batch_index`: +/// `parent.nonce + 1`, or 0 if `parent_batch_index` is None (genesis or +/// post-cascade torn-state new Tip). +/// +/// If `batch_index_opt` is None, SQLite auto-assigns (highest existing +1). +/// The explicit form is used only by `initialize_open_state` to pin the +/// very first genesis batch at `batch_index = 0`. +/// +/// The `trg_enforce_nonce_contiguity` trigger verifies the nonce matches +/// `parent.nonce + 1`, so caller and schema agree. +pub(super) fn insert_new_batch( + tx: &Transaction<'_>, + batch_index_opt: Option, + parent_batch_index: Option, + created_at_ms: i64, +) -> Result { + let nonce = compute_next_nonce(tx, parent_batch_index)?; + match batch_index_opt { + Some(bi) => { + tx.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (?1, ?2, ?3, ?4)", + params![ + u64_to_i64(bi), + parent_batch_index.map(u64_to_i64), + u64_to_i64(nonce), + created_at_ms + ], + )?; + Ok(bi) + } + None => { + tx.execute( + "INSERT INTO batches (parent_batch_index, nonce, created_at_ms) \ + VALUES (?1, ?2, ?3)", + params![ + parent_batch_index.map(u64_to_i64), + u64_to_i64(nonce), + created_at_ms + ], + )?; + Ok(i64_to_u64(tx.last_insert_rowid())) + } + } +} + +fn compute_next_nonce(tx: &Transaction<'_>, parent_batch_index: Option) -> Result { + match parent_batch_index { + None => Ok(0), + Some(parent_bi) => { + let parent_nonce: i64 = tx.query_row( + "SELECT nonce FROM batches WHERE batch_index = ?1", + params![u64_to_i64(parent_bi)], + |row| row.get(0), + )?; + Ok(i64_to_u64(parent_nonce).saturating_add(1)) + } + } +} + +/// Mark a batch as sealed (inclusion lane closed it). Write-once per the +/// `trg_sealed_at_ms_write_once` trigger. +pub(super) fn seal_batch(tx: &Transaction<'_>, batch_index: u64, sealed_at_ms: i64) -> Result<()> { + let changed = tx.execute( + "UPDATE batches SET sealed_at_ms = ?1 WHERE batch_index = ?2", + params![sealed_at_ms, u64_to_i64(batch_index)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) +} + +pub(super) fn insert_open_frame( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + created_at_ms: i64, + frame_fee: u16, + safe_block: u64, +) -> Result<()> { + tx.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (?1, ?2, ?3, ?4, ?5)", + params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + created_at_ms, + i64::from(frame_fee), + u64_to_i64(safe_block), + ], + )?; + Ok(()) +} + +/// Insert one `sequenced_l2_txs` row per safe-input index in `range` for the +/// given (batch, frame). Used by ingress (frame close) and recovery (re-drain +/// after cascade invalidation). +pub(super) fn persist_frame_direct_sequence( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + range: SafeInputRange, +) -> Result<()> { + if range.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO sequenced_l2_txs (batch_index, frame_in_batch, user_op_pos_in_frame, safe_input_index) \ + VALUES (?1, ?2, NULL, ?3)", + )?; + for safe_input_index in range.start()..range.end() { + stmt.execute(params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + u64_to_i64(safe_input_index), + ])?; + } + Ok(()) +} diff --git a/sequencer/src/storage/open.rs b/sequencer/src/storage/open.rs new file mode 100644 index 0000000..4a63514 --- /dev/null +++ b/sequencer/src/storage/open.rs @@ -0,0 +1,123 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! `Storage` struct definition plus connection-open and migration entry points. +//! +//! Method clusters live in sibling files (`ingress`, `egress`, `l1_inputs`, +//! `l1_submission`, `recovery`, `admin`) — each adds its own `impl Storage`. + +use rusqlite::{Connection, OpenFlags, Result, Transaction, TransactionBehavior}; +use rusqlite_migration::{M, Migrations}; + +use super::StorageOpenError; + +const MIGRATION_0001_SCHEMA: &str = include_str!("migrations/0001_schema.sql"); + +/// SQLite `synchronous` pragma used by every production writer connection. +/// `NORMAL` is appropriate under WAL — fsyncs at checkpoint boundaries, not +/// per-transaction. Tests use the same value; if a future test needs +/// `FULL`/`OFF`, add a `#[cfg(test)]` override. +const SYNCHRONOUS_PRAGMA: &str = "NORMAL"; + +/// Sequencer storage backed by a single SQLite database. +/// +/// All methods take `&mut self` to enforce exclusive access at the Rust level, +/// matching SQLite's single-writer model. Read-only access uses a separate +/// `Storage` instance opened via [`Storage::open_read_only`]. +pub struct Storage { + pub(super) conn: Connection, +} + +impl Storage { + /// Production open: runs migrations, uses the canonical synchronous pragma. + pub fn open(path: &str) -> Result { + let mut conn = open_writer_connection(path)?; + run_migrations(&mut conn)?; + Ok(Self { conn }) + } + + /// Read-only handle. Uses a 50ms `busy_timeout` (vs. 5s for writers) so + /// readers fail fast under write pressure and don't block on hot paths. + pub fn open_read_only(path: &str) -> Result { + let conn = open_reader_connection(path)?; + Ok(Self { conn }) + } + + /// Test-only: open without running migrations. Lets tests pre-seed the + /// schema before the migration runner touches it. + #[cfg(test)] + pub fn open_without_migrations(path: &str) -> Result { + let conn = open_writer_connection(path)?; + Ok(Self { conn }) + } + + /// Test-only: return a raw `Connection` with the same pragmas as + /// [`Storage::open`]. Used by tests that need to reach past the typed API + /// (e.g., rewinding `synced_at_ms`, installing failure triggers). + #[cfg(test)] + pub fn open_connection(path: &str) -> std::result::Result { + open_writer_connection(path) + } + + /// Run `f` inside a Deferred transaction, commit on success. For pure reads. + /// + /// Using Deferred rather than Immediate matches SQLite's default — readers + /// don't hold a write lock and don't block writers. If `f` returns `Err` + /// the transaction is dropped unsent (auto-rollback); on success the + /// commit is issued before returning `Ok`. + pub fn read(&mut self, f: F) -> Result + where + F: FnOnce(&Transaction<'_>) -> Result, + { + let tx = self + .conn + .transaction_with_behavior(TransactionBehavior::Deferred)?; + let out = f(&tx)?; + tx.commit()?; + Ok(out) + } + + /// Run `f` inside an Immediate transaction, commit on success. For any + /// mutation. + /// + /// Using Immediate acquires the write lock upfront so contending writers + /// see `SQLITE_BUSY` immediately rather than mid-transaction — this is + /// the right cadence under WAL + single-writer discipline. Same commit / + /// auto-rollback semantics as [`Storage::read`]. + pub fn write(&mut self, f: F) -> Result + where + F: FnOnce(&Transaction<'_>) -> Result, + { + let tx = self + .conn + .transaction_with_behavior(TransactionBehavior::Immediate)?; + let out = f(&tx)?; + tx.commit()?; + Ok(out) + } +} + +/// Open a read-write connection with WAL + `NORMAL` sync + 5s busy timeout. +fn open_writer_connection(path: &str) -> Result { + let conn = Connection::open(path)?; + conn.pragma_update(None, "foreign_keys", "ON")?; + conn.pragma_update(None, "journal_mode", "WAL")?; + conn.pragma_update(None, "synchronous", SYNCHRONOUS_PRAGMA)?; + conn.pragma_update(None, "busy_timeout", 5000)?; + Ok(conn) +} + +/// Open a read-only connection with `query_only` + 50ms busy timeout. +fn open_reader_connection(path: &str) -> Result { + let conn = Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; + conn.pragma_update(None, "query_only", "ON")?; + conn.pragma_update(None, "busy_timeout", 50)?; + Ok(conn) +} + +/// Apply all migrations. Package-private — callers use [`Storage::open`] +/// which runs this automatically. +pub(super) fn run_migrations(conn: &mut Connection) -> Result<(), StorageOpenError> { + Migrations::from_slice(&[M::up(MIGRATION_0001_SCHEMA)]).to_latest(conn)?; + Ok(()) +} diff --git a/sequencer/src/storage/queries.rs b/sequencer/src/storage/queries.rs new file mode 100644 index 0000000..77af1e4 --- /dev/null +++ b/sequencer/src/storage/queries.rs @@ -0,0 +1,186 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Read-side helpers shared across writer-role files. +//! +//! These take a `&Connection` (or `&Transaction`, which derefs) rather than +//! `&mut Storage`, so they can compose inside a larger transaction built by +//! any writer role. Single-caller reads stay inline in the writer that owns +//! them; only the reads reused by two or more roles live here. + +use alloy_primitives::Address; +use rusqlite::{Connection, OptionalExtension, Result, Transaction, params}; + +use super::convert::{from_unix_ms, i64_to_u16, i64_to_u32, i64_to_u64}; +use super::{BatchPolicy, WriteHead}; +use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; + +// ── Write-head loading ─────────────────────────────────────────────────── +// +// Used by ingress (initialize/resume open state) and recovery (open recovery +// batch after cascade). The WriteHead is the in-memory mirror of the latest +// open batch/frame and must always match what's persisted in `batches` and +// `frames`. + +pub(super) fn load_current_write_head(tx: &Transaction<'_>) -> Result> { + // The Tip is the single row in `valid_open_batch` (enforced by + // `ux_single_valid_tip`). Returns None if there's no Tip (fresh DB, + // or torn state between cascade and recovery-batch open). + let latest_batch = match tx.query_row( + "SELECT + b.batch_index, + b.created_at_ms, + (SELECT COUNT(*) FROM user_ops u WHERE u.batch_index = b.batch_index) AS user_op_count + FROM valid_open_batch b", + [], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, i64>(1)?, + row.get::<_, i64>(2)?, + )) + }, + ) { + Ok(row) => row, + Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None), + Err(other) => return Err(other), + }; + let (batch_index_i64, batch_created_at_ms, batch_user_op_count_i64) = latest_batch; + + let (frame_in_batch_i64, frame_fee_i64, safe_block_i64): (i64, i64, i64) = tx.query_row( + "SELECT frame_in_batch, fee, safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch DESC LIMIT 1", + params![batch_index_i64], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), + )?; + + let open_frame_user_op_count: i64 = tx.query_row( + "SELECT COUNT(*) FROM user_ops WHERE batch_index = ?1 AND frame_in_batch = ?2", + params![batch_index_i64, frame_in_batch_i64], + |row| row.get(0), + )?; + + let policy = query_batch_policy(tx)?; + Ok(Some(WriteHead { + batch_index: i64_to_u64(batch_index_i64), + batch_created_at: from_unix_ms(batch_created_at_ms), + frame_fee: i64_to_u16(frame_fee_i64), + safe_block: i64_to_u64(safe_block_i64), + batch_user_op_count: i64_to_u64(batch_user_op_count_i64), + open_frame_user_op_count: i64_to_u32(open_frame_user_op_count), + frame_in_batch: i64_to_u32(frame_in_batch_i64), + max_batch_user_op_bytes: super::batch_size_target_bytes(policy), + })) +} + +// ── Cross-writer scalar reads ───────────────────────────────────────────── + +pub(super) fn query_latest_safe_input_index_exclusive(conn: &Connection) -> Result { + let value: Option = + conn.query_row("SELECT MAX(safe_input_index) FROM safe_inputs", [], |row| { + row.get(0) + })?; + Ok(match value { + Some(last_index) => i64_to_u64(last_index).saturating_add(1), + None => 0, + }) +} + +pub(super) fn current_safe_block(conn: &Connection) -> Result> { + let value: Option = conn + .query_row( + "SELECT block_number FROM l1_safe_head WHERE singleton_id = 0 LIMIT 1", + [], + |row| row.get(0), + ) + .optional()?; + Ok(value.map(i64_to_u64)) +} + +/// Current safe block, or `QueryReturnedNoRows` if no observation has been +/// recorded yet. Use from code paths that only run after preemptive recovery +/// has produced a safe-head observation (submitter, lane, post-recovery +/// open-batch helpers); callers that legitimately handle the "never synced" +/// case should use [`current_safe_block`] instead. +pub(super) fn current_safe_block_required(conn: &Connection) -> Result { + current_safe_block(conn)?.ok_or(rusqlite::Error::QueryReturnedNoRows) +} + +/// L1 timestamp (Unix seconds) of the current safe block, or `None` if no +/// real safe-head observation has recorded one yet. +pub(super) fn current_safe_block_timestamp(conn: &Connection) -> Result> { + let value: Option = conn + .query_row( + "SELECT block_timestamp FROM l1_safe_head WHERE singleton_id = 0 LIMIT 1", + [], + |row| row.get(0), + ) + .optional()?; + Ok(value.map(i64_to_u64)) +} + +/// Wall-clock timestamp (Unix ms) of the last observed safe-head advance, or +/// `None` if no real safe-head observation has occurred yet. +pub(super) fn last_safe_progress_ms(conn: &Connection) -> Result> { + let value: Option = conn + .query_row( + "SELECT synced_at_ms FROM l1_safe_head WHERE singleton_id = 0", + [], + |row| row.get(0), + ) + .optional()?; + Ok(value.map(i64_to_u64)) +} + +pub(super) fn query_batch_policy(conn: &Connection) -> Result { + let (log_recommended_fee, log_batch_size_target): (i64, i64) = conn.query_row( + "SELECT log_recommended_fee, log_batch_size_target FROM batch_policy_derived \ + WHERE singleton_id = 0 LIMIT 1", + [], + |row| Ok((row.get(0)?, row.get(1)?)), + )?; + let max_exp = sequencer_core::fee::MAX_EXPONENT; + Ok(BatchPolicy { + // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. + recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), + batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), + }) +} + +// ── Ordered L2-tx row decoding ─────────────────────────────────────────── +// +// Used by egress paging and the per-batch replay reader. Each caller builds +// the row shape inside its own `query_map` closure and hands the fields to +// this decoder rather than defining an intermediate struct. + +pub(super) fn decode_l2_tx_row( + kind: i64, + sender: Option>, + data: Option>, + fee: Option, + payload: Option>, + block_number: Option, +) -> SequencedL2Tx { + let sender_bytes = sender.expect("ordered replay row: missing sender"); + assert_eq!( + sender_bytes.len(), + 20, + "ordered replay row: sender must be 20 bytes" + ); + if kind == 0 { + SequencedL2Tx::UserOp(ValidUserOp { + sender: Address::from_slice(sender_bytes.as_slice()), + // Replay uses the persisted frame fee (log-space exponent) to mirror canonical execution. + fee: i64_to_u16(fee.expect("ordered replay row: missing fee")), + data: data.expect("ordered replay row: missing data"), + }) + } else { + SequencedL2Tx::Direct(DirectInput { + sender: Address::from_slice(sender_bytes.as_slice()), + block_number: i64_to_u64( + block_number.expect("ordered replay row: missing block_number"), + ), + payload: payload.expect("ordered replay row: missing payload"), + }) + } +} diff --git a/sequencer/src/storage/queries/insert_sequenced_direct_input.sql b/sequencer/src/storage/queries/insert_sequenced_direct_input.sql deleted file mode 100644 index b382c5a..0000000 --- a/sequencer/src/storage/queries/insert_sequenced_direct_input.sql +++ /dev/null @@ -1,6 +0,0 @@ -INSERT INTO sequenced_l2_txs ( - batch_index, - frame_in_batch, - user_op_pos_in_frame, - safe_input_index -) VALUES (?1, ?2, NULL, ?3) diff --git a/sequencer/src/storage/queries/insert_user_op.sql b/sequencer/src/storage/queries/insert_user_op.sql deleted file mode 100644 index d86a72a..0000000 --- a/sequencer/src/storage/queries/insert_user_op.sql +++ /dev/null @@ -1,11 +0,0 @@ -INSERT INTO user_ops ( - batch_index, - frame_in_batch, - pos_in_frame, - sender, - nonce, - max_fee, - data, - sig, - received_at_ms -) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9) diff --git a/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql b/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql deleted file mode 100644 index ca7f9d0..0000000 --- a/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql +++ /dev/null @@ -1,11 +0,0 @@ -SELECT - b.batch_index, - b.created_at_ms, - ( - SELECT COUNT(*) - FROM user_ops u - WHERE u.batch_index = b.batch_index - ) AS user_op_count -FROM batches b -ORDER BY b.batch_index DESC -LIMIT 1 diff --git a/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql b/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql deleted file mode 100644 index c2b5a43..0000000 --- a/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql +++ /dev/null @@ -1,8 +0,0 @@ -SELECT - f.frame_in_batch, - f.fee, - f.safe_block -FROM frames f -WHERE f.batch_index = ?1 -ORDER BY f.frame_in_batch DESC -LIMIT 1 diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql deleted file mode 100644 index 3dd8361..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql +++ /dev/null @@ -1,23 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.batch_index = ?1 -ORDER BY s.offset ASC diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql deleted file mode 100644 index 5c3d52a..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql +++ /dev/null @@ -1,23 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.offset > ?1 -ORDER BY s.offset ASC diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql deleted file mode 100644 index 9b3d8a6..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql +++ /dev/null @@ -1,24 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.offset > ?1 -ORDER BY s.offset ASC -LIMIT ?2 diff --git a/sequencer/src/storage/queries/select_safe_inputs_range.sql b/sequencer/src/storage/queries/select_safe_inputs_range.sql deleted file mode 100644 index 3d82d7e..0000000 --- a/sequencer/src/storage/queries/select_safe_inputs_range.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT safe_input_index, sender, payload, block_number -FROM safe_inputs -WHERE safe_input_index >= ?1 AND safe_input_index < ?2 -ORDER BY safe_input_index ASC diff --git a/sequencer/src/storage/queries/select_user_op_count_for_frame.sql b/sequencer/src/storage/queries/select_user_op_count_for_frame.sql deleted file mode 100644 index e28ada7..0000000 --- a/sequencer/src/storage/queries/select_user_op_count_for_frame.sql +++ /dev/null @@ -1,3 +0,0 @@ -SELECT COUNT(*) -FROM user_ops -WHERE batch_index = ?1 AND frame_in_batch = ?2 diff --git a/sequencer/src/storage/recovery.rs b/sequencer/src/storage/recovery.rs new file mode 100644 index 0000000..ecb0108 --- /dev/null +++ b/sequencer/src/storage/recovery.rs @@ -0,0 +1,515 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Recovery writer: cascade-invalidates stale batches, opens recovery batches, +//! and composes the startup-recovery transaction. +//! +//! See `docs/recovery/README.md` for the full design (batch tree, coloring, +//! nonce poisoning, TLA+ proof). This file's job is to enforce that design +//! locally — read the design first if you're touching this code. +//! +//! Free functions here are shared with the batch submitter +//! (`l1_submission.rs`); they take `&Connection` / `&Transaction` so the +//! startup path can compose them into one atomic transaction. +//! +//! ## Fault model +//! +//! Recovery is robust to submission and outage failures (crashes, network +//! errors, mempool drops, extended downtime). It is NOT designed to defend +//! against arbitrarily malformed self-submissions: the scheduler-frontier +//! materialization in [`super::safe_accepted_batches`] trusts that on-chain +//! batches from the sequencer's own address are structurally valid. The +//! sequencer controls its own submissions — this is a deliberate system +//! assumption, not a gap. + +use rusqlite::{Connection, OptionalExtension, Result, Transaction, params}; +use sequencer_core::protocol::{ProtocolTiming, age_exceeds}; + +use super::Storage; +use super::convert::{i64_to_u64, now_unix_ms, u64_to_i64}; +use super::mutations::{insert_new_batch, insert_open_frame, persist_frame_direct_sequence}; +use super::queries::{ + current_safe_block_required, current_safe_block_timestamp, last_safe_progress_ms, + query_batch_policy, query_latest_safe_input_index_exclusive, +}; +use super::safe_accepted_batches::frontier_nonce; + +/// Outcome of a danger-zone check. +/// +/// Each variant maps to a distinct recovery response, encoded in +/// [`super::super::recovery::StartupAction`]: +/// +/// - `L1ViewStale` → refuse boot. The L1 safe block is too old or unknown. +/// - `ClosedBatchInDanger(closed_idx)` → flush + cascade. A closed batch past the +/// accepted frontier has L1 transactions that may already be on chain; +/// we need the flush to resolve their fate before cascading. +/// - `TipInDanger(tip_idx)` → direct Tip recovery, no flush. The Tip has no L1 +/// footprint, so we can invalidate it and open a fresh one without +/// any L1 round-trip. +/// - `EstimatedBatchInDanger(idx)` → refuse boot. The observed safe block is +/// still below the danger threshold, but wall-clock time since the last +/// safe-head advance has consumed the batch's remaining runway. +/// - `Safe` → no recovery work; just ensure the Tip exists (torn-state +/// crash recovery branch). +/// +/// The runtime danger detector treats every non-`Safe` variant as +/// "exit for recovery" — the difference between them only matters at the +/// next startup, where the dispatch differs. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DangerStatus { + /// No danger detected — none of the checks tripped. + Safe, + /// L1 safe-head timestamp is too old or unknown. Recovery cannot reason + /// from the local L1 view, so startup must refuse. + L1ViewStale, + /// Observed-safe check tripped on a *closed* batch past the + /// accepted frontier: aged beyond `protocol.danger_threshold()` against + /// the observed safe block. L1 view is fresh; flushing and cascading is + /// meaningful. + ClosedBatchInDanger(u64), + /// Observed-safe check tripped on the open *Tip*: aged beyond + /// `protocol.danger_threshold()` against the observed safe block, but + /// no closed batch is in danger. L1 view is fresh; the Tip has no L1 + /// footprint, so direct recovery (no flush) is correct. + TipInDanger(u64), + /// Batch-relative wall-clock estimate tripped after the global L1 view + /// freshness check passed. We refuse rather than recover because the batch + /// only crossed danger in estimated time, not observed safe-state. + EstimatedBatchInDanger(u64), +} + +impl Storage { + /// Unified danger-zone detection. + /// + /// Runs four checks inside a single read transaction, in priority order: + /// + /// 1. **L1 read freshness**: if the safe block timestamp is missing or + /// older than `protocol.l1_read_stale_after_blocks`, return + /// `L1ViewStale`. A stale L1 view is unusable even if the RPC answers. + /// 2. **Observed closed-frontier**: `find_closed_frontier_batch_in_danger` + /// against `protocol.danger_threshold()`. Uses the observed safe block. + /// 3. **Observed open Tip**: `find_tip_batch_in_danger` against + /// `protocol.danger_threshold()`. Catches the case where all closed + /// batches are gold but the Tip is aging — the lane is stuck or the + /// Tip rotated without a safe-block advance. + /// 4. **Batch-relative wall-clock estimate**: if a correction applies + /// ([`ProtocolTiming::wall_clock_adjusted_danger_threshold`] returns + /// `Some`), widens to `find_first_batch_in_danger` against + /// `danger_threshold − missed_blocks`. This is a fallback for when the + /// observed safe block has not crossed danger yet, but wall-clock time + /// since the last safe-head advance says the provider view is too stale + /// to trust for continued soft confirmations. + /// + /// Returns the first variant that fires, in the order + /// `L1ViewStale` → `ClosedBatchInDanger` → `TipInDanger` → + /// `EstimatedBatchInDanger` → `Safe`. The order encodes the + /// "trust" hierarchy: + /// + /// - **L1 view freshness gates everything.** If the safe block timestamp + /// is too old or unknown, neither recovery nor continued soft + /// confirmations are honest. + /// - **Closed observed danger beats Tip.** When a closed batch is in danger, + /// we need a flush (to resolve its L1 transaction's fate) regardless + /// of the Tip's state. The cascade naturally catches the Tip via + /// `batch_index >= N`. + /// - **Tip is the residual.** Only fires when no closed batch is in + /// danger. Routes to direct Tip recovery — no flush needed. + /// - **Estimated danger is the fallback.** If the observed safe-state checks have + /// not crossed the threshold, but wall-clock extrapolation says they + /// would have crossed had the safe head kept advancing, startup refuses + /// instead of issuing soft confirmations on a stale L1 view. + /// + /// `now_ms` is passed in (rather than read from `SystemTime::now()` here) + /// so the storage layer stays testable without time mocking. Production + /// callers pass the current Unix-ms clock. + pub fn check_danger(&mut self, protocol: &ProtocolTiming, now_ms: u64) -> Result { + self.read(|tx| { + if protocol.l1_view_is_stale(current_safe_block_timestamp(tx)?, now_ms) { + return Ok(DangerStatus::L1ViewStale); + } + + let danger_threshold = protocol.danger_threshold(); + if let Some(idx) = find_closed_frontier_batch_in_danger(tx, danger_threshold)? { + return Ok(DangerStatus::ClosedBatchInDanger(idx)); + } + + if let Some(idx) = find_tip_batch_in_danger(tx, danger_threshold)? { + return Ok(DangerStatus::TipInDanger(idx)); + } + + let last = last_safe_progress_ms(tx)?; + if let Some(adjusted) = protocol.wall_clock_adjusted_danger_threshold(last, now_ms) + && let Some(idx) = find_first_batch_in_danger(tx, adjusted)? + { + return Ok(DangerStatus::EstimatedBatchInDanger(idx)); + } + + Ok(DangerStatus::Safe) + }) + } + + /// Mark a single batch as invalid. Test-only seeder — production code goes + /// through [`Storage::recover_post_flush`] or [`Storage::recover_aging_tip`]. + /// Idempotent: leaves already-invalid rows alone. + #[cfg(test)] + pub(crate) fn insert_invalid_batch(&mut self, batch_index: u64) -> Result<()> { + let now_ms = now_unix_ms(); + self.conn.execute( + "UPDATE batches SET invalidated_at_ms = ?1 \ + WHERE batch_index = ?2 AND invalidated_at_ms IS NULL", + params![now_ms, u64_to_i64(batch_index)], + )?; + Ok(()) + } + + /// Cascade everything past the gold frontier. Called from the + /// `FlushAndCascade` startup path, after the mempool flush has resolved + /// every wallet-nonce slot and `safe_accepted_batches` has been re-synced. + /// + /// # The "everything past gold is doomed" rule + /// + /// At this point the gold frontier is at its maximum extent: every + /// submitted batch has either been accepted (gold) or rejected by the + /// scheduler simulation (Silver-stale, since nonce-mismatch is impossible + /// at the frontier under self-trust), or its tx was killed by a flush + /// no-op (Pending, no `safe_input`). All three non-gold states are doomed: + /// + /// - **Silver-stale:** scheduler skipped it; downstream batches are + /// nonce-poisoned. + /// - **Pending:** the original L1 tx is dead. Re-submission could in + /// principle land fresh, but the *next* recovery cycle's flush would + /// compete with the resub at its new wallet-nonce slot and the bumped + /// no-op typically wins. The system would loop until current staleness + /// crossed `MAX_WAIT_BLOCKS`. Cascading now converges in one cycle. + /// + /// So once we've committed to recovery (the danger detector tripped, the + /// flush ran), the right move is to cascade the entire non-gold suffix + /// and open a fresh recovery batch. + /// + /// Three aftermath shapes: + /// + /// 1. **Everything worked:** all in-flight batches landed fresh and were + /// accepted. Gold extends to the last submitted batch; no first + /// non-gold closed. (See "Tip handling" below for the subtle subcase.) + /// 2. **Mixed:** some landed (stale or poisoned), some replaced. First + /// non-gold closed is either Silver-stale or Pending. Cascade from + /// there; the `batch_index >= N` rule catches the rest of the suffix + /// including the open Tip. + /// 3. **All replaced:** flush no-ops won every race. Gold doesn't + /// advance; first non-gold closed is the very first non-accepted batch. + /// + /// # Tip handling + /// + /// In cases (2)/(3) the cascade catches the Tip via `batch_index >= N`. + /// In case (1), there's no closed pivot — but the Tip can still be in + /// the danger zone: + /// + /// When the lane rotates a batch without a safe-block advance between + /// frames (e.g. immediately after init, when both share the bootstrap + /// `safe_block`), the Tip's `first_frame.safe_block` equals the closed + /// batch's. The closed batch can become gold by inclusion-staleness + /// (`inclusion_block - first_frame < MAX_WAIT`) while the Tip's age, + /// computed against `current_safe_block` after the flush wait, has + /// crossed `danger_threshold`. Pure monotonicity (`S_tip ≥ S_closed`) doesn't + /// rule this out — equality is allowed. + /// + /// So in the no-pivot branch we additionally check the Tip against + /// `danger_threshold` (the same threshold that would have triggered + /// recovery had the Tip been a closed batch). We're already committed + /// to recovery; the Tip is past gold; if it's also in the danger zone, + /// cascade it and open a fresh one. + /// + /// # Atomicity + /// + /// Runs as a single SQLite write transaction. On crash mid-way, the + /// txn rolls back; on commit, the cascade and the recovery batch open + /// land together. Idempotent on re-run because `valid_*` views filter + /// out already-invalidated rows. + /// + /// # Precondition + /// + /// The caller MUST have just synced L1 state via + /// [`Storage::append_safe_inputs`]; the gold frontier in + /// `safe_accepted_batches` must reflect the latest safe head. Otherwise + /// the cascade may invalidate batches that haven't yet had a chance to + /// be processed by the scheduler simulation. + /// + /// Returns the newly-invalidated batch indices (empty if none). + pub fn recover_post_flush(&mut self, danger_threshold: u64) -> Result> { + self.write(|tx| recover_post_flush_inner(tx, danger_threshold)) + } + + /// Cascade the open Tip if its first frame has aged past + /// `danger_threshold`. Called from the `RecoverTip` startup path (no flush + /// happened), and defensively from `Proceed`. + /// + /// # Why a threshold here, but no closed-frontier check + /// + /// In the Proceed path no flush ran, so closed batches past the gold + /// frontier (if any) might still be in their natural lifecycle — + /// pending in the mempool, recently included, awaiting safe finality. + /// Cascading them would prematurely abort their progression. + /// + /// The Tip is different: it has no L1 footprint at all (no `w_nonce`, + /// no `safe_input`), so there's no L1 outcome to wait on. Once its + /// first frame has aged into the danger zone, the rule "everything + /// past gold is bad once we're committed to recovery" applies. In the + /// `RecoverTip` path startup is already committed; in `Proceed`, this + /// branch is defensive and should normally be a no-op. + /// + /// # Threshold = danger_threshold, not MAX_WAIT + /// + /// We use `danger_threshold` (= `MAX_WAIT_BLOCKS - margin`) rather than + /// `MAX_WAIT_BLOCKS`. The Tip threshold is the same one that would + /// trigger the recovery cycle had the Tip been a closed batch. If the + /// Tip is past that threshold, the next danger detector tick after + /// resume would re-trip on the Tip's eventual first close + submission + /// anyway (the closed batch would inherit its first frame's safe_block). + /// Cascading now saves the cycle. + /// + /// # Precondition + /// + /// As with [`Storage::recover_post_flush`], the caller must have synced + /// L1 state. (Threshold comparison reads `current_safe_block` from + /// `l1_safe_head`.) + /// + /// Returns the newly-invalidated batch indices (empty if Tip is fresh, + /// `[tip_index]` when the Tip was cascaded). + pub fn recover_aging_tip(&mut self, danger_threshold: u64) -> Result> { + self.write(|tx| recover_aging_tip_inner(tx, danger_threshold)) + } +} + +// ── Free functions used by both recovery and the batch submitter ────────── + +/// See [`Storage::recover_post_flush`] for the design rationale. +fn recover_post_flush_inner(tx: &Transaction<'_>, danger_threshold: u64) -> Result> { + // Path 1: any closed batch past gold cascades unconditionally. + let pivot = match first_non_gold_closed_batch(tx)? { + Some(batch_index) => Some(batch_index), + // Path 2 (corner case): all closed are gold, but the Tip might be + // in the danger zone — see `recover_post_flush` doc on Tip handling. + None => find_tip_batch_in_danger(tx, danger_threshold)?, + }; + let invalidated = match pivot { + Some(batch_index) => cascade_invalidate_from(tx, batch_index)?, + None => Vec::new(), + }; + if !invalidated.is_empty() || !has_valid_open_batch(tx)? { + open_recovery_batch_in_tx(tx)?; + } + Ok(invalidated) +} + +/// See [`Storage::recover_aging_tip`] for the design rationale. +fn recover_aging_tip_inner(tx: &Transaction<'_>, danger_threshold: u64) -> Result> { + let invalidated = match find_tip_batch_in_danger(tx, danger_threshold)? { + Some(batch_index) => cascade_invalidate_from(tx, batch_index)?, + None => Vec::new(), + }; + if !invalidated.is_empty() || !has_valid_open_batch(tx)? { + open_recovery_batch_in_tx(tx)?; + } + Ok(invalidated) +} + +/// First valid closed batch sitting at the gold frontier — i.e., with +/// `nonce >= frontier_nonce` (the next nonce the scheduler is expected to +/// accept). Used by [`recover_post_flush_inner`] as the cascade pivot, and +/// by [`find_closed_frontier_batch_in_danger`] as the candidate to age-check. +/// +/// `>=`, not `>`: `frontier_nonce` is the *next-expected* nonce +/// (`latest_accepted.nonce + 1`), so the actual cascade-pivot batch carries +/// `nonce == frontier_nonce`. Using `>` would skip it. +/// +/// On the valid path, batch nonces are contiguous (enforced by the +/// `trg_enforce_nonce_contiguity` trigger), so the first match always has +/// `nonce == frontier_nonce`. We don't double-check that invariant here — +/// the trigger is the source of truth (see AGENTS.md "Self-trust": no +/// defense-in-depth checks against the sequencer's own bugs). Returns +/// `None` if all closed batches are gold. +fn first_non_gold_closed_batch(conn: &Connection) -> Result> { + let frontier = frontier_nonce(conn)?; + let batch_index: Option = conn + .query_row( + "SELECT batch_index FROM valid_closed_batches \ + WHERE nonce >= ?1 ORDER BY nonce ASC LIMIT 1", + rusqlite::params![u64_to_i64(frontier)], + |row| row.get(0), + ) + .optional()?; + Ok(batch_index.map(i64_to_u64)) +} + +/// Either the closed-frontier batch or the Tip, whichever (if either) has +/// aged past `threshold` against `current_safe_block`. Used by +/// [`Storage::check_danger`]'s wall-clock-adjusted arm, where the dispatch +/// is the same (`Refuse`) regardless of which one fired. +/// +/// Closed-frontier wins ties: if a closed batch is in danger, the Tip is +/// older still (sequencer opens new batches at non-decreasing `safe_block`), +/// and cascading from the closed batch covers the Tip via +/// `batch_index >= N`. +/// +/// Reads `safe_accepted_batches`, which is maintained atomically with each +/// [`Storage::append_safe_inputs`] call. +pub(super) fn find_first_batch_in_danger(conn: &Connection, threshold: u64) -> Result> { + if let Some(batch_index) = find_closed_frontier_batch_in_danger(conn, threshold)? { + return Ok(Some(batch_index)); + } + find_tip_batch_in_danger(conn, threshold) +} + +/// First valid closed batch past the gold frontier whose first frame is older +/// than `current_safe_block - threshold`. Returns `None` if no such batch +/// exists. +/// +/// Why look only at the frontier batch, not "every batch past gold"? +/// `safe_accepted_batches` is updated atomically with each safe-head advance +/// (see [`super::safe_accepted_batches`]) and walks the spine until it hits +/// a barrier — a stale batch, or a missing slot the scheduler can't bridge. +/// So the first batch past the frontier IS the barrier; downstream batches +/// are nonce-poisoned by definition (a stale frontier ⇒ scheduler skips ⇒ +/// every later batch arrives at an unexpected nonce). Looking further is +/// redundant. +/// +/// Does NOT consider the Tip — the Tip has no L1 transaction, so it's not +/// part of the closed-frontier-staleness category. +/// [`find_first_batch_in_danger`] composes with [`find_tip_batch_in_danger`] +/// when callers want both. +pub(super) fn find_closed_frontier_batch_in_danger( + conn: &Connection, + threshold: u64, +) -> Result> { + match first_non_gold_closed_batch(conn)? { + Some(batch_index) => batch_in_danger(conn, batch_index, threshold), + None => Ok(None), + } +} + +/// The Tip (if any) whose first frame is older than +/// `current_safe_block - threshold`. Returns `None` if no Tip exists or it +/// isn't in danger yet. +fn find_tip_batch_in_danger(conn: &Connection, threshold: u64) -> Result> { + let tip_batch_index: Option = conn + .query_row("SELECT batch_index FROM valid_open_batch", [], |row| { + row.get(0) + }) + .optional()?; + match tip_batch_index { + Some(tip_batch_index) => batch_in_danger(conn, i64_to_u64(tip_batch_index), threshold), + None => Ok(None), + } +} + +/// Shared age-check used by the closed-frontier and Tip helpers. Returns +/// `Some(batch_index)` if `current_safe_block - first_frame.safe_block >= threshold`. +fn batch_in_danger(conn: &Connection, batch_index: u64, threshold: u64) -> Result> { + let first_frame_safe_block = first_frame_safe_block_of(conn, u64_to_i64(batch_index))?; + let safe_block = current_safe_block_required(conn)?; + Ok(age_exceeds(safe_block, first_frame_safe_block, threshold).then_some(batch_index)) +} + +/// `frames.safe_block` of the lowest `frame_in_batch` in `batch_index`. +/// Returns 0 if the batch has no frames yet. +fn first_frame_safe_block_of(conn: &Connection, batch_index: i64) -> Result { + let value: Option = conn + .query_row( + "SELECT safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch ASC LIMIT 1", + params![batch_index], + |row| row.get(0), + ) + .optional()?; + Ok(i64_to_u64(value.unwrap_or(0))) +} + +/// Cascade-invalidate all valid batches with `batch_index >= from_batch_index`. +/// +/// Reads the list BEFORE mutating — the SELECT must see the rows the UPDATE +/// will then mark invalid. The `invalidated_at_ms IS NULL` guard on the UPDATE +/// keeps this idempotent: rows already invalid are untouched. +fn cascade_invalidate_from(tx: &Transaction<'_>, from_batch_index: u64) -> Result> { + let from_i64 = u64_to_i64(from_batch_index); + + let invalidated: Vec = { + let mut stmt = tx.prepare( + "SELECT batch_index FROM valid_batches \ + WHERE batch_index >= ?1 ORDER BY batch_index ASC", + )?; + stmt.query_map(params![from_i64], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + })? + .collect::>()? + }; + + if !invalidated.is_empty() { + let now_ms = now_unix_ms(); + tx.execute( + "UPDATE batches SET invalidated_at_ms = ?1 \ + WHERE batch_index >= ?2 AND invalidated_at_ms IS NULL", + params![now_ms, from_i64], + )?; + } + + Ok(invalidated) +} + +/// Check whether the DB has a valid Tip (`sealed_at_ms IS NULL AND +/// `invalidated_at_ms IS NULL`). +fn has_valid_open_batch(tx: &Connection) -> Result { + let count: i64 = tx.query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + })?; + Ok(count > 0) +} + +/// Open a fresh recovery batch inside an existing transaction. +/// +/// The new Tip's parent is the highest-indexed valid batch (the last valid +/// ancestor after the cascade). If none exists — the torn-state case where +/// every batch has been invalidated — the new Tip has no parent (nonce 0, +/// like a fresh genesis). +/// +/// Requires an observed safe head (uses `current_safe_block_required`); only +/// reachable from `recover_post_flush` / `recover_aging_tip`, which run from +/// `run_preemptive_recovery` after a successful sync or behind the +/// `L1ViewStale` refusal gate. +fn open_recovery_batch_in_tx(tx: &Transaction<'_>) -> Result<()> { + let now_ms = now_unix_ms(); + let safe_block = current_safe_block_required(tx)?; + + let parent_batch_index: Option = tx + .query_row("SELECT MAX(batch_index) FROM valid_batches", [], |row| { + row.get::<_, Option>(0) + })? + .map(i64_to_u64); + + let policy = query_batch_policy(tx)?; + let next_bi = insert_new_batch(tx, None, parent_batch_index, now_ms)?; + insert_open_frame(tx, next_bi, 0, now_ms, policy.recommended_fee, safe_block)?; + + // Drain leading directs into the new batch's first frame. + // Direct inputs from invalidated batches are re-drained into the recovery batch + // (the UNIQUE(safe_input_index) constraint was removed to allow this). + let next_undrained: u64 = { + // MAX(safe_input_index) + 1 over the valid drained rows. Cursor rewinds + // when a batch is invalidated, so the recovery batch sees the same + // undrained range its invalidated predecessor was working from. + let value: i64 = tx.query_row( + "SELECT COALESCE(MAX(safe_input_index) + 1, 0) FROM valid_sequenced_l2_txs \ + WHERE safe_input_index IS NOT NULL", + [], + |row| row.get(0), + )?; + i64_to_u64(value) + }; + let safe_input_end = query_latest_safe_input_index_exclusive(tx)?; + let leading_range = super::SafeInputRange::new(next_undrained, safe_input_end); + persist_frame_direct_sequence(tx, next_bi, 0, leading_range)?; + Ok(()) +} + +#[cfg(test)] +#[path = "recovery_tests.rs"] +mod tests; diff --git a/sequencer/src/storage/recovery_tests.rs b/sequencer/src/storage/recovery_tests.rs new file mode 100644 index 0000000..4d672e8 --- /dev/null +++ b/sequencer/src/storage/recovery_tests.rs @@ -0,0 +1,2184 @@ +use super::super::test_helpers::{ + SENDER_A, all_ordered_l2_txs, default_protocol_timing, make_stale_batch_payload, + seed_closed_batches, temp_db, +}; +use super::{find_closed_frontier_batch_in_danger, find_first_batch_in_danger}; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use alloy_primitives::Address; +use sequencer_core::l2_tx::SequencedL2Tx; + +mod invalid_batches { + use super::*; + + // ── invalid_batches filtering ────────────────────────────────────── + + #[test] + fn invalid_batches_excluded_from_latest_batch_index() { + let db = temp_db("invalid-latest-batch"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + seed_closed_batches(&mut storage, 3); + assert_eq!( + storage.latest_batch_index().expect("latest").unwrap(), + 3, + "open batch should be 3" + ); + + storage.insert_invalid_batch(3).expect("mark invalid"); + assert_eq!(storage.latest_batch_index().expect("latest").unwrap(), 2,); + + storage.insert_invalid_batch(2).expect("mark invalid"); + assert_eq!(storage.latest_batch_index().expect("latest").unwrap(), 1,); + } + + #[test] + fn invalid_batches_excluded_from_ordered_l2_txs() { + let db = temp_db("invalid-ordered-txs"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs_0 = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }]; + storage + .append_safe_inputs( + 10, + directs_0.as_slice(), + SENDER_A, + &default_protocol_timing(), + ) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let directs_1 = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 20, + }]; + storage + .append_safe_inputs( + 20, + directs_1.as_slice(), + SENDER_A, + &default_protocol_timing(), + ) + .expect("append"); + storage + .close_frame_only(&mut head, 20, SafeInputRange::new(1, 2)) + .expect("close frame"); + + let all = all_ordered_l2_txs(&mut storage); + assert_eq!(all.len(), 2); + + storage.insert_invalid_batch(0).expect("mark invalid"); + + let filtered = all_ordered_l2_txs(&mut storage); + assert_eq!(filtered.len(), 1); + match &filtered[0] { + SequencedL2Tx::Direct(d) => assert_eq!(d.payload.as_slice(), &[0xbb]), + _ => panic!("expected direct input"), + } + } + + #[test] + fn invalid_batches_excluded_from_ordered_l2_txs_for_batch() { + let db = temp_db("invalid-ordered-for-batch"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }]; + storage + .append_safe_inputs(10, directs.as_slice(), SENDER_A, &default_protocol_timing()) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let txs = storage.ordered_l2_txs_for_batch(0).expect("load batch 0"); + assert_eq!(txs.len(), 1); + + storage.insert_invalid_batch(0).expect("mark invalid"); + let txs = storage + .ordered_l2_txs_for_batch(0) + .expect("load batch 0 after invalidation"); + assert!(txs.is_empty(), "invalid batch should return no txs"); + } + + #[test] + fn invalid_batches_excluded_from_drained_direct_count() { + let db = temp_db("invalid-drained-count"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, directs.as_slice(), SENDER_A, &default_protocol_timing()) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame"); + assert_eq!( + storage.next_undrained_safe_input_index().expect("cursor"), + 2 + ); + + storage.insert_invalid_batch(0).expect("mark invalid"); + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("cursor after invalidation"), + 0 + ); + } +} + +mod recover_post_flush { + use super::*; + + // ── recover_post_flush: cascade from first non-gold ──────────────── + // + // These tests simulate the post-flush state directly: append safe-inputs + // to drive `populate_safe_accepted_batches`, then call + // `recover_post_flush()`. The cascade is unconditional (no threshold) — + // any closed batch past the gold frontier is doomed and gets cascaded + // along with everything after it. + + #[test] + fn cascades_from_stale() { + let db = temp_db("detect-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..3 { + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch"); + } + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + let invalidated = storage + .recover_post_flush(1200) + .expect("detect and recover"); + assert_eq!(invalidated, vec![0, 1, 2, 3]); + + let head = storage.open_state().expect("load open state"); + assert!(head.is_some(), "recovery should have opened a fresh batch"); + assert_eq!(head.unwrap().batch_index, 4); + } + + #[test] + fn is_idempotent() { + let db = temp_db("detect-idempotent"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + let first = storage.recover_post_flush(1200).expect("first detect"); + assert_eq!(first, vec![0, 1]); + + let second = storage.recover_post_flush(1200).expect("second detect"); + assert!(second.is_empty()); + } + + #[test] + fn no_op_when_recovery_batch_landed_fresh_in_new_generation() { + // Post-flush contract: `recover_post_flush` is called only after the + // gold frontier has been re-synced. If the recovery batch from a + // previous cycle landed on L1 fresh (within MAX_WAIT) and was + // accepted by `populate_safe_accepted_batches`, the new generation + // is gold and a fresh `recover_post_flush` call should be a no-op — + // the stale ancestor's safe-input must not false-match the + // nonce-reused gen-2 batch. + let db = temp_db("post-flush-no-op-after-fresh-gen2"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale safe input"); + // Gen 1 cascade. Stale batch 0 + open Tip 1 invalidated; recovery + // batch 2 opened with nonce reused (= 0). + let first = storage.recover_post_flush(1200).expect("gen1 recovery"); + assert_eq!(first, vec![0, 1]); + + // Submitter posts the recovery batch; it lands fresh on L1. + let mut head = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head, 1300) + .expect("close recovery batch"); + storage + .append_safe_inputs( + 1310, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 1300), + block_number: 1310, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append fresh safe input for recovery batch"); + + // populate_safe_accepted_batches accepts the gen-2 batch (nonce 0 at + // first_frame=1300, inclusion=1310 → inclusion-staleness 10 < MAX_WAIT). + // Gold advances. A fresh recover_post_flush is a no-op: the only + // closed-non-gold candidate would have to be past the new frontier, + // but everything is gold. + let second = storage.recover_post_flush(1200).expect("post-flush no-op"); + assert!( + second.is_empty(), + "fresh gen-2 batch must be gold; old stale row must not false-match \ + via reused nonce, got: {second:?}" + ); + } + + #[test] + fn detects_stale_reused_nonce_in_new_generation() { + let db = temp_db("detect-reused-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen1 stale safe input"); + let first = storage.recover_post_flush(1200).expect("gen1 recovery"); + assert_eq!(first, vec![0, 1]); + + let mut head = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close gen2 batch"); + + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 100), + block_number: 2410, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen2 stale safe input"); + let second = storage.recover_post_flush(1200).expect("gen2 recovery"); + assert_eq!( + second, + vec![2, 3], + "stale reused nonce in gen2 must still be detected" + ); + } + + #[test] + fn cascades_aging_tip_when_no_closed_pivot_exists() { + // Regression for the no-pivot Tip case: the closed batch landed fresh + // (becomes gold) but the Tip's `first_frame.safe_block` matches the + // closed batch's, and after the flush wait the current safe block has + // pushed the Tip's age into the danger zone. Pure monotonicity + // (`S_tip >= S_closed`) doesn't rule this out — equality is allowed + // when the lane rotates without a safe-block advance between frames. + // + // Earlier `recover_post_flush` only checked the first non-gold closed + // batch; on no closed pivot it became a no-op, leaving the stale Tip + // for the next danger trip to catch. Fix: fall through to a Tip + // check against `danger_threshold` and cascade the Tip directly. + let db = temp_db("post-flush-tip-no-pivot"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // S_closed = S_tip = 10 (lane rotated without a safe-block advance). + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0; open Tip with same safe_block=10"); + + let batch_submitter = Address::repeat_byte(0xAA); + // Closed batch 0 landed fresh: inclusion_block=200 → inclusion-staleness 190 + // is well below MAX_WAIT, so populate accepts it as gold. + // Then advance safe head to 1100: batch 0 age (gold) is irrelevant, + // but Tip's age = 1100 - 10 = 1090 > danger_threshold (let's pass 1000). + storage + .append_safe_inputs( + 200, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 200, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append fresh safe input — batch 0 becomes gold"); + storage + .append_safe_inputs(1100, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past Tip's danger threshold"); + + // Sanity: no closed batch is past gold (all gold). + assert_eq!( + find_closed_frontier_batch_in_danger(&storage.conn, 900).expect("strict check"), + None, + "all closed batches gold; closed-frontier check returns None" + ); + + // recover_post_flush(danger_threshold=1000): Tip's age (1090) > 1000. + // Fall-through Tip check fires; Tip cascaded; recovery batch opened. + let invalidated = storage + .recover_post_flush(1000) + .expect("recover_post_flush with danger_threshold=1000"); + assert_eq!( + invalidated, + vec![1], + "Tip (batch 1) cascaded via danger_threshold fall-through" + ); + + // A fresh recovery Tip exists with a higher batch_index. + let head = storage.open_state().expect("load").expect("recovery Tip"); + assert_eq!(head.batch_index, 2); + } + + #[test] + fn no_op_when_tip_age_below_danger_threshold() { + // Negative companion to the above: closed batch is gold, Tip exists, + // but Tip's age is below `danger_threshold`. Cascade must not fire. + let db = temp_db("post-flush-tip-below-danger"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 200, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 200, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append fresh safe input — batch 0 becomes gold"); + // current=500: Tip's age = 490, below threshold 1000. + storage + .append_safe_inputs(500, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head, but not past danger threshold"); + + let invalidated = storage + .recover_post_flush(1000) + .expect("recover_post_flush with fresh Tip"); + assert!( + invalidated.is_empty(), + "Tip below danger_threshold must not be cascaded, got: {invalidated:?}" + ); + + // Tip preserved. + let head = storage.open_state().expect("load").expect("Tip"); + assert_eq!(head.batch_index, 1); + } +} + +mod tip_staleness { + use super::*; + + // ── Tip staleness — `recover_aging_tip` and post-flush combined cases ── + // + // `recover_aging_tip` backs the RecoverTip startup action: all closed + // batches are outside the danger zone, but the open Tip has aged past the + // configured threshold. The Tip has no L1 footprint (no `w_nonce`, no + // `safe_input`), so it can be invalidated directly without a flush. + // + // The first four tests cover that path: + // - positive: Tip IS stale → invalidated + // - negative: Tip is fresh → NOT invalidated (no false positives) + // - boundary at threshold: invalidated + // - boundary just below threshold: not invalidated + // + // The remaining tests cover the FlushAndCascade path's combined + // closed+open behavior (`recover_post_flush`'s `batch_index >= N` + // cascade rule catches the Tip too). + + #[test] + fn open_batch_stale_by_current_safe_block_is_invalidated() { + // Scenario: sequencer opened batch 0 at safe_block=10, never closed it, + // then stayed down until safe advanced to 1500 (>1200 past safe_block). + // Recovery must invalidate the open batch. + let db = temp_db("open-batch-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open state at safe_block=10"); + + // Advance the safe head so the open batch's first frame (safe_block=10) + // is now stale: 1500 - 10 >= 1200. + storage + .append_safe_inputs(1500, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past MAX_WAIT_BLOCKS"); + + let invalidated = storage + .recover_aging_tip(1200) + .expect("recover from stale open batch"); + assert_eq!( + invalidated, + vec![0], + "open batch 0 should be invalidated by current staleness" + ); + + // A fresh recovery batch must be opened at batch_index=1. + let head = storage.open_state().expect("load").expect("head"); + assert_eq!(head.batch_index, 1, "recovery batch is the next index"); + } + + #[test] + fn open_batch_not_yet_stale_is_not_invalidated() { + // Negative: open batch's first frame safe_block=10 with current safe=1100. + // 1100 - 10 = 1090 < 1200. Must NOT cascade. + // Catches false-positive regressions in `recover_aging_tip`. + let db = temp_db("open-batch-fresh"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open state at safe_block=10"); + + storage + .append_safe_inputs(1100, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head below threshold"); + + let invalidated = storage + .recover_aging_tip(1200) + .expect("recover with non-stale open batch"); + assert!( + invalidated.is_empty(), + "fresh open batch must not be cascade-invalidated, got: {invalidated:?}" + ); + + // The open batch must still be the live one (no recovery batch opened). + let head = storage.open_state().expect("load").expect("head"); + assert_eq!( + head.batch_index, 0, + "original open batch 0 must still be the head" + ); + } + + #[test] + fn open_batch_exactly_at_threshold_is_invalidated() { + // Boundary: 1210 - 10 = 1200, which is >= MAX_WAIT_BLOCKS. + // The staleness comparison is `>=`, so this must invalidate. + let db = temp_db("open-batch-boundary"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + + storage + .append_safe_inputs(1210, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head to exact threshold"); + + let invalidated = storage.recover_aging_tip(1200).expect("recover"); + assert_eq!(invalidated, vec![0], "boundary (>= threshold) invalidates"); + } + + #[test] + fn open_batch_one_block_below_threshold_is_not_invalidated() { + // Boundary: 1209 - 10 = 1199 < 1200. One-block margin must NOT invalidate. + let db = temp_db("open-batch-below-boundary"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + + storage + .append_safe_inputs(1209, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head to one block below threshold"); + + let invalidated = storage.recover_aging_tip(1200).expect("recover"); + assert!( + invalidated.is_empty(), + "one-block-below-threshold must not invalidate, got: {invalidated:?}" + ); + } + + #[test] + fn closed_unsubmitted_stale_and_open_stale_both_cascade() { + // Scenario: batch 0 is closed and nonced but never submitted to L1 + // (safe_accepted_batches is empty). Batch 1 is open and also stale. + // `find_first_batch_in_danger` should return closed batch 0 at the + // frontier (nonce 0, no acceptance yet) and cascade through batch 1. + let db = temp_db("closed-unsubmitted-and-open-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize at safe_block=10"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + // Advance safe head so batch 0's first frame (safe_block=10) is stale. + storage + .append_safe_inputs(1500, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past staleness"); + + let invalidated = storage.recover_post_flush(1200).expect("recover"); + assert_eq!( + invalidated, + vec![0, 1], + "closed unsubmitted batch 0 and subsequent open batch 1 cascade together" + ); + } + + #[test] + fn opens_batch_after_torn_invalidation() { + let db = temp_db("detect-torn"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage.insert_invalid_batch(0).expect("invalidate 0"); + storage.insert_invalid_batch(1).expect("invalidate 1"); + storage + .append_safe_inputs(10, &[], SENDER_A, &default_protocol_timing()) + .expect("record observed safe head"); + + let invalidated = storage + .recover_post_flush(1200) + .expect("recover from torn state"); + assert!(invalidated.is_empty(), "no new invalidations"); + + let head = storage.open_state().expect("load open state"); + assert!(head.is_some(), "recovery should have opened a fresh batch"); + assert_eq!(head.unwrap().batch_index, 2); + } + + #[test] + fn rolls_back_when_cascade_update_aborts() { + let db = temp_db("detect-cascade-abort"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + // Advance safe head so batch 0's first frame (safe_block=10) is stale. + storage + .append_safe_inputs(1500, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past staleness"); + + storage + .conn + .execute_batch( + "CREATE TRIGGER fail_cascade_invalidation + AFTER UPDATE OF invalidated_at_ms ON batches + WHEN NEW.invalidated_at_ms IS NOT NULL + AND OLD.invalidated_at_ms IS NULL + BEGIN + SELECT RAISE(ABORT, 'injected cascade failure'); + END;", + ) + .expect("install failure trigger"); + + let err = storage + .recover_post_flush(1200) + .expect_err("trigger should abort recovery transaction"); + assert!( + err.to_string().contains("injected cascade failure"), + "unexpected error: {err:?}" + ); + drop(storage); + + let conn = Storage::open_connection(db.path.as_str()).expect("open read conn"); + let invalidated_count: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE invalidated_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .expect("count invalidated"); + assert_eq!( + invalidated_count, 0, + "failed cascade must not persist torn invalidation state" + ); + + let batch_count: i64 = conn + .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) + .expect("count batches"); + assert_eq!( + batch_count, 2, + "failed recovery must not open an extra batch" + ); + + let open_batch_index: i64 = conn + .query_row("SELECT batch_index FROM valid_open_batch", [], |row| { + row.get(0) + }) + .expect("query valid open batch"); + assert_eq!( + open_batch_index, 1, + "failed recovery must leave the original Tip in place" + ); + } + + #[test] + fn recovery_redrains_direct_inputs_and_replay_sees_them_once() { + let db = temp_db("recovery-redrain-e2e"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + let deposits = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xd1], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xd2], + block_number: 10, + }, + ]; + storage + .append_safe_inputs( + 10, + deposits.as_slice(), + SENDER_A, + &default_protocol_timing(), + ) + .expect("append deposits"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame with deposits"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let before = all_ordered_l2_txs(&mut storage); + assert_eq!(before.len(), 2, "both deposits should be visible"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale batch submission"); + let invalidated = storage + .recover_post_flush(1200) + .expect("detect and recover"); + assert!(!invalidated.is_empty(), "should have invalidated batches"); + + let after = all_ordered_l2_txs(&mut storage); + let direct_payloads: Vec<&[u8]> = after + .iter() + .filter_map(|tx| match tx { + SequencedL2Tx::Direct(d) if d.sender != batch_submitter => { + Some(d.payload.as_slice()) + } + _ => None, + }) + .collect(); + assert_eq!( + direct_payloads, + vec![&[0xd1][..], &[0xd2][..]], + "deposits must appear exactly once in replay after recovery" + ); + + let recovery_batch = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery_batch.batch_index) + .expect("load recovery batch txs"); + let recovery_direct_count = recovery_txs + .iter() + .filter(|tx| matches!(tx, SequencedL2Tx::Direct(d) if d.sender != batch_submitter)) + .count(); + assert_eq!( + recovery_direct_count, 2, + "both deposits should be in the recovery batch" + ); + } + + #[test] + fn undrained_safe_input_appears_in_recovery_batch_first_frame() { + // a deposit ingested into safe_inputs but not yet drained + // into any frame must be sequenced into the recovery batch's first + // frame after cascade. Complements (re-drain from + // invalidated) with the never-drained case. + let db = temp_db("recovery-includes-undrained"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0 with no deposits"); + + let non_submitter = Address::repeat_byte(0xCC); + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: non_submitter, + payload: vec![0xde, 0xad], + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append undrained deposit"); + let before = all_ordered_l2_txs(&mut storage); + assert!( + before.iter().all(|tx| !matches!( + tx, + SequencedL2Tx::Direct(d) if d.sender == non_submitter + )), + "undrained deposit must not be sequenced before drain", + ); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale batch submission"); + let invalidated = storage.recover_post_flush(1200).expect("recover"); + assert!(!invalidated.is_empty(), "stale batch must cascade"); + + let recovery = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery.batch_index) + .expect("load recovery batch txs"); + let deposit_payloads: Vec<&[u8]> = recovery_txs + .iter() + .filter_map(|tx| match tx { + SequencedL2Tx::Direct(d) if d.sender == non_submitter => Some(d.payload.as_slice()), + _ => None, + }) + .collect(); + assert_eq!( + deposit_payloads, + vec![&[0xde, 0xad][..]], + "undrained deposit must land in the recovery batch's first frame", + ); + } + + #[test] + fn recovery_batch_opens_empty_when_no_direct_inputs_pending() { + // no drained-into-invalidated inputs AND no undrained safe + // inputs → recovery batch opens with an empty first frame (aside + // from the batch-submitter's own self-submission, which is drained + // but carries no user-visible payload). + let db = temp_db("recovery-empty-first-frame"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale batch submission"); + let invalidated = storage.recover_post_flush(1200).expect("recover"); + assert_eq!(invalidated, vec![0, 1]); + + let recovery = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery.batch_index) + .expect("load recovery batch txs"); + let user_visible: Vec<_> = recovery_txs + .iter() + .filter(|tx| match tx { + SequencedL2Tx::Direct(d) => d.sender != batch_submitter, + SequencedL2Tx::UserOp(_) => true, + }) + .collect(); + assert!( + user_visible.is_empty(), + "recovery batch must have no deposits or user-ops when none were pending: {user_visible:?}", + ); + } + + #[test] + fn first_batch_stale_recovery_reuses_nonce_zero() { + // first-ever batch (nonce 0) goes stale before reaching + // Gold. Cascade invalidates it; recovery opens a fresh batch that + // reuses nonce 0 (no valid ancestor exists to advance the nonce). + let db = temp_db("first-batch-stale-nonce-zero"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0 (nonce 0)"); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale batch submission"); + let invalidated = storage.recover_post_flush(1200).expect("recover"); + assert_eq!( + invalidated, + vec![0, 1], + "closed batch 0 and open batch 1 must both invalidate", + ); + + let recovery = storage.open_state().expect("load").unwrap(); + assert_eq!(recovery.batch_index, 2, "batch_index is monotonic (PK)"); + drop(storage); + + // Read the new Tip's nonce and parent pointer via raw SQL — no + // public accessor surfaces them. + let conn = Storage::open_connection(db.path.as_str()).expect("open read conn"); + let recovery_i64 = recovery.batch_index as i64; + let nonce: i64 = conn + .query_row( + "SELECT nonce FROM batches WHERE batch_index = ?1", + [recovery_i64], + |row| row.get(0), + ) + .expect("query nonce"); + assert_eq!( + nonce, 0, + "recovery batch must reuse nonce 0 after torn cascade", + ); + let parent: Option = conn + .query_row( + "SELECT parent_batch_index FROM batches WHERE batch_index = ?1", + [recovery_i64], + |row| row.get(0), + ) + .expect("query parent"); + assert_eq!( + parent, None, + "torn recovery has no valid ancestor; parent_batch_index is NULL", + ); + } + + #[test] + fn after_post_recovery_crash_is_no_op() { + // simulate a crash AFTER open_recovery_batch has run. On + // restart, the state contains a valid open recovery batch (no stale + // tail remains). A fresh `recover_post_flush` call must be a no-op: + // no new invalidations, and the same recovery batch remains the Tip. + // + // Distinct from `is_idempotent` (idempotent back-to-back call on the + // same Storage handle): this test drops and reopens Storage to model + // a full restart over the persisted DB. + let db = temp_db("post-recovery-crash-idempotent"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append stale submission"); + // First call: full recovery runs to completion and opens a new Tip. + let invalidated = storage.recover_post_flush(1200).expect("recover"); + assert_eq!(invalidated, vec![0, 1]); + let recovery_index = storage + .open_state() + .expect("load open") + .expect("recovery batch exists") + .batch_index; + + // Simulate "crash immediately after open_recovery_batch" by + // dropping Storage (mimics process exit) and reopening against the + // same on-disk DB. + drop(storage); + let mut storage = Storage::open(db.path.as_str()).expect("reopen storage"); + + let second = storage.recover_post_flush(1200).expect("second detect"); + assert!( + second.is_empty(), + "post-recovery restart must be a no-op, got invalidations: {second:?}", + ); + let after = storage + .open_state() + .expect("load after restart") + .expect("recovery batch still Tip after restart"); + assert_eq!( + after.batch_index, recovery_index, + "the same recovery batch must remain the Tip after restart", + ); + } +} + +mod check_danger_zone { + use super::*; + + // ── check_danger_zone ────────────────────────────────────────────── + + #[test] + fn check_danger_zone_ignores_old_gold_batches() { + // Batch 0 is Gold (accepted, first_frame_safe_block=10). Batch 1 is + // the open tip at first_frame_safe_block=100. Advance safe head to + // 1200 so batch 0 is age=1190 > 1125 (past threshold, but it's Gold + // and therefore excluded) while batch 1 is age=1100 < 1125 (fresh). + // + // `check_danger_zone` must return None: no unresolved batch is in + // danger. Gold batches (accepted past the frontier) never participate, + // and the open tip isn't old enough to trip the threshold. + let db = temp_db("danger-zone-gold"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + // Advance to a current safe block where batch 0 (safe_block=10) is + // past threshold (1200-10=1190>=1125) but batch 1 (safe_block=100) + // is still fresh (1200-100=1100<1125). + storage + .append_safe_inputs(1200, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe block"); + + let result = + find_closed_frontier_batch_in_danger(&storage.conn, 1125).expect("check danger zone"); + assert!( + result.is_none(), + "old Gold batches should not trigger danger zone; got batch_index={result:?}" + ); + } + + #[test] + fn check_danger_zone_does_not_flag_open_batch_zombie() { + // `check_danger_zone` is for zombie detection: it must NOT flag the + // open batch (which has no L1 tx to become a zombie). Flagging open + // batches here would put the live submitter into a shutdown/restart + // loop when an open batch ages into the danger zone without any + // pending wallet-nonce slots to flush. + // + // Scenario: only an open batch exists, aged past the danger + // threshold. `check_danger_zone` returns None. + let db = temp_db("danger-zone-open-no-zombie"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1200, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past danger threshold"); + + let result = + find_closed_frontier_batch_in_danger(&storage.conn, 1125).expect("check danger zone"); + assert!( + result.is_none(), + "open batch (no zombie) must not trigger check_danger_zone; got batch_index={result:?}" + ); + } +} + +mod check_any_unresolved { + use super::*; + + // ── check_any_unresolved_batch_in_danger ─────────────────────────────── + + #[test] + fn check_any_unresolved_flags_stale_open_batch() { + // Wall-clock fallback regression: `check_any_unresolved_batch_in_danger` + // MUST flag a stale open batch. This is the semantic the wall-clock + // fallback relies on — if L1 is unreachable and an open batch may be + // past the threshold, refuse to boot rather than accept user ops + // into a batch that can't land. + let db = temp_db("any-unresolved-open-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1200, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head past threshold"); + + let result = find_first_batch_in_danger(&storage.conn, 1125) + .expect("check any unresolved in danger"); + assert_eq!( + result, + Some(0), + "stale open batch (batch 0) must be flagged by the unified check" + ); + } + + #[test] + fn check_any_unresolved_does_not_flag_fresh_open_batch() { + // Negative counterpart. Fresh open batch below threshold must not + // trigger false positives in the unified check. + let db = temp_db("any-unresolved-open-fresh"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1100, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe head below threshold"); + + let result = find_first_batch_in_danger(&storage.conn, 1125) + .expect("check any unresolved in danger"); + assert!( + result.is_none(), + "fresh open batch must not trigger the unified check; got batch_index={result:?}" + ); + } + + #[test] + fn check_danger_zone_triggers_on_frontier_batch() { + let db = temp_db("danger-zone-frontier"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + storage + .append_safe_inputs(1200, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe block"); + + let result = + find_closed_frontier_batch_in_danger(&storage.conn, 1125).expect("check danger zone"); + assert_eq!(result, Some(1), "frontier batch should trigger danger zone"); + } + + #[test] + fn check_danger_zone_does_not_trigger_below_threshold() { + let db = temp_db("danger-zone-below"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + storage + .append_safe_inputs(1134, &[], SENDER_A, &default_protocol_timing()) + .expect("advance safe block"); + + let result = + find_closed_frontier_batch_in_danger(&storage.conn, 1125).expect("check danger zone"); + assert!( + result.is_none(), + "should not trigger below threshold; got batch_index={result:?}" + ); + } +} + +mod boundary { + use super::*; + + // ── boundary tests ───────────────────────────────────────────────── + + #[test] + fn boundary_exactly_max_wait_is_stale() { + let db = temp_db("detect-boundary-exact"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch"); + + storage + .append_safe_inputs( + 1300, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 1300, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + let invalidated = storage.recover_post_flush(1200).expect("detect"); + assert_eq!(invalidated, vec![0, 1], "exactly at max_wait must be stale"); + assert_eq!(storage.open_state().expect("load").unwrap().batch_index, 2); + } + + #[test] + fn boundary_one_below_max_wait_is_not_stale() { + let db = temp_db("detect-boundary-one-below"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch"); + + storage + .append_safe_inputs( + 1299, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 1299, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append safe input"); + let invalidated = storage.recover_post_flush(1200).expect("detect"); + assert!( + invalidated.is_empty(), + "one below max_wait must not be stale" + ); + } + + #[test] + fn all_batches_invalidated_frontier_zero() { + let db = temp_db("detect-frontier-zero"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..3 { + storage.close_frame_and_batch(&mut head, 10).expect("close"); + } + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append"); + let inv = storage.recover_post_flush(1200).expect("detect"); + assert_eq!(inv, vec![0, 1, 2, 3]); + assert!(storage.open_state().expect("open").is_some()); + } + + #[test] + fn recovery_batch_itself_becomes_stale() { + let db = temp_db("detect-recovery-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen1"); + let inv1 = storage.recover_post_flush(1200).expect("recover gen1"); + assert_eq!(inv1, vec![0, 1]); + + let mut head2 = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head2, 1210) + .expect("close gen2"); + + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 1210), + block_number: 2410, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen2"); + let inv2 = storage.recover_post_flush(1200).expect("recover gen2"); + assert_eq!(inv2, vec![2, 3]); + assert!(storage.open_state().expect("open").is_some()); + } + + #[test] + fn multi_round_gen3_recovery() { + let db = temp_db("detect-gen3"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append"); + storage.recover_post_flush(1200).expect("recover gen1"); + + let mut head2 = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head2, 1210) + .expect("close gen2"); + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 1210), + block_number: 2410, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen2"); + storage.recover_post_flush(1200).expect("recover gen2"); + + let mut head3 = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head3, 2410) + .expect("close gen3"); + storage + .append_safe_inputs( + 2420, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 2410), + block_number: 2420, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append gen3"); + let inv3 = storage.recover_post_flush(1200).expect("recover gen3"); + assert!(inv3.is_empty(), "gen3 should be healthy"); + } + + #[test] + fn large_cascade_50_batches() { + let db = temp_db("detect-large-cascade"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..50 { + storage.close_frame_and_batch(&mut head, 10).expect("close"); + } + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append"); + let inv = storage.recover_post_flush(1200).expect("detect"); + assert_eq!(inv.len(), 51); + } +} + +mod schema_invariants { + use super::*; + use rusqlite::params; + + // ── Schema-invariant regression tests ───────────────────────────────── + // + // These exercise the triggers + partial unique index in the schema + // directly. Each one checks a specific invariant that previously lived + // in writer discipline and now has a schema-level tripwire. + // + // They're here (rather than in a dedicated file) because they share the + // recovery tests' setup: same helpers, same fixture. Failures here mean + // the schema guard regressed, which is the whole point of making the + // invariants declarative. + + #[test] + fn schema_rejects_second_valid_tip() { + // The partial unique index `ux_single_valid_tip` catches a writer that + // opens a new Tip without sealing the old one first. + let db = temp_db("schema-second-tip"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + + // Try to bypass the lane and insert a second valid Tip directly. + let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (99, 0, 1, 1000)", + [], + ); + let msg = format!("{err:?}"); + assert!( + msg.contains("UNIQUE constraint failed") && msg.contains("ux_single_valid_tip"), + "expected ux_single_valid_tip violation, got: {msg}" + ); + } + + #[test] + fn schema_rejects_bad_nonce_contiguity() { + // Nonce must equal parent.nonce + 1 — trigger enforces it. + // Insert the bad-nonce batch as already-sealed so it doesn't collide + // with the existing Tip on `ux_single_valid_tip`. + let db = temp_db("schema-bad-nonce"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0; batch 1 is now Tip"); + // Batch 1 has nonce 1 (0 + 1). Insert child with nonce 99 (should be 2). + let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms, sealed_at_ms) \ + VALUES (999, 1, 99, \ + (SELECT created_at_ms FROM batches WHERE batch_index = 1), \ + (SELECT created_at_ms FROM batches WHERE batch_index = 1))", + [], + ); + assert!( + format!("{err:?}").contains("batch nonce must equal parent.nonce + 1"), + "expected nonce trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_genesis_with_nonzero_nonce() { + let db = temp_db("schema-genesis-nonzero"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (0, NULL, 7, 100)", + [], + ); + assert!( + format!("{err:?}").contains("genesis batch must have nonce 0"), + "expected genesis-nonce trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_re_seal() { + let db = temp_db("schema-re-seal"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0 (seals it)"); + // Batch 0 is sealed. Attempt to re-seal with a different timestamp. + let err = storage.conn.execute( + "UPDATE batches SET sealed_at_ms = sealed_at_ms + 1 WHERE batch_index = 0", + [], + ); + assert!( + format!("{err:?}").contains("sealed_at_ms is write-once"), + "expected write-once trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_re_invalidate() { + let db = temp_db("schema-re-invalidate"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + // Seed via test helper (uses now_unix_ms internally). + storage.insert_invalid_batch(0).expect("first invalidate"); + let err = storage.conn.execute( + "UPDATE batches SET invalidated_at_ms = invalidated_at_ms + 1 \ + WHERE batch_index = 0", + [], + ); + assert!( + format!("{err:?}").contains("invalidated_at_ms is write-once"), + "expected write-once trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_frame_insert_into_sealed_batch() { + // This is the bug class we've been fighting: writer holds a stale + // WriteHead and writes to a batch that's no longer the Tip. + let db = temp_db("schema-frame-into-sealed"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0; batch 0 is now sealed"); + // Batch 0 is sealed. Any direct insert into its frames must fail. + let err = storage.conn.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (0, 1, 100, 1060, 0)", + [], + ); + assert!( + format!("{err:?}").contains("frames can only be inserted into the current Tip"), + "expected tip-only-frames trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_frame_insert_into_invalidated_batch() { + let db = temp_db("schema-frame-into-invalid"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + // Invalidate (without sealing) — Tip that never closed, now dead. + storage.insert_invalid_batch(0).expect("invalidate tip"); + let err = storage.conn.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (0, 1, 100, 1060, 0)", + [], + ); + assert!( + format!("{err:?}").contains("frames can only be inserted into the current Tip"), + "expected tip-only-frames trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_parent_batch_index_mutation() { + let db = temp_db("schema-parent-immutable"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0"); + // Try to change parent of batch 1 — should be rejected. + let err = storage.conn.execute( + "UPDATE batches SET parent_batch_index = NULL WHERE batch_index = 1", + [], + ); + assert!( + format!("{err:?}").contains("parent_batch_index is immutable"), + "expected parent-immutable trigger, got: {err:?}" + ); + } + + #[test] + fn nonce_reuse_after_cascade_with_valid_ancestor() { + // Beautiful part of parent-pointer + structural nonce: after a cascade + // that invalidates only the suffix (keeping an ancestor valid), the + // new Tip's parent is the last valid ancestor, so its nonce is + // `ancestor.nonce + 1` — the same nonce the invalidated suffix's + // first batch had. Nonce reuse is automatic. + // + // Scenario: batch 0 is accepted (safe_accepted_batches advances past + // nonce 0). Batch 1 is stale and triggers cascade. Batches 1, 2, 3 + // invalidated; batch 0 remains valid. + let db = temp_db("nonce-reuse-with-ancestor"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize at safe_block=10"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0 (nonce 0)"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1 (nonce 1)"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 2 (nonce 2)"); + // Head is now batch 3 (nonce 3, first_frame_safe_block=100). + + // Batch 0 lands on L1 (accepted): safe_input at block 20 with nonce 0. + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append batch 0 submission"); + // Advance safe head so batches 1, 2, 3 (first_frame=100) are stale. + // current_safe=1400 → 1400-100=1300 >= 1200. + storage + .append_safe_inputs(1400, &[], SENDER_A, &default_protocol_timing()) + .expect("advance past threshold"); + + let inv = storage.recover_post_flush(1200).expect("recover"); + // Batches 1, 2, 3 invalidated; batch 0 (accepted) stays valid. + assert_eq!(inv, vec![1, 2, 3], "only the suffix cascades, got {inv:?}"); + + // The NEW Tip has parent=0 (the last valid ancestor), nonce=1. + // This is what nonce reuse looks like: the invalidated batch 1 had + // nonce 1; the recovery batch gets the same nonce via +1-from-parent. + let (tip_nonce, tip_parent): (i64, i64) = storage + .conn + .query_row( + "SELECT nonce, parent_batch_index FROM valid_open_batch", + [], + |row| Ok((row.get(0)?, row.get(1)?)), + ) + .expect("query recovery tip"); + assert_eq!(tip_nonce, 1, "recovery Tip reuses nonce 1"); + assert_eq!(tip_parent, 0, "recovery Tip's parent is batch 0"); + } + + // ── CHECK-constraint regressions ────────────────────────── + // + // These differ from the trigger-based tests above: they exercise raw + // `CHECK` clauses declared in `migrations/0001_schema.sql`. The + // type-safe `Storage` API would reject these values Rust-side; we go + // through `storage.conn.execute` to prove the schema itself refuses. + + #[test] + fn schema_rejects_safe_input_with_wrong_sender_length() { + // `safe_inputs.sender` must be exactly 20 bytes (an + // Ethereum address). A shorter or longer blob must be refused + // by the schema even if it bypasses the Rust API. + let db = temp_db("schema-safe-input-sender-len"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let err = storage.conn.execute( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (0, X'DEADBEEF', X'00', 10)", + [], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on safe_inputs.sender, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_user_op_with_wrong_sender_length() { + // `user_ops.sender` must be 20 bytes. + let db = temp_db("schema-user-op-sender-len"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + // Seed a frame to satisfy the composite FK — initialize_open_state + // creates batch 0 frame 0 as the Tip. + let mut storage = storage; + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let err = storage.conn.execute( + "INSERT INTO user_ops \ + (batch_index, frame_in_batch, pos_in_frame, sender, nonce, max_fee, data, sig, received_at_ms) \ + VALUES (0, 0, 0, X'010203', 0, 0, X'', ?1, 0)", + params![vec![0u8; 65]], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on user_ops.sender length, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_user_op_with_wrong_signature_length() { + // `user_ops.sig` must be exactly 65 bytes (secp256k1 + // r || s || v). Regression for "accidentally accepted a non-65 + // signature and crashed a downstream consumer." + let db = temp_db("schema-user-op-sig-len"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let valid_sender = vec![0u8; 20]; + let short_sig = vec![0u8; 32]; // Should be 65. + let err = storage.conn.execute( + "INSERT INTO user_ops \ + (batch_index, frame_in_batch, pos_in_frame, sender, nonce, max_fee, data, sig, received_at_ms) \ + VALUES (0, 0, 0, ?1, 0, 0, X'', ?2, 0)", + params![valid_sender, short_sig], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on user_ops.sig length, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_sequenced_l2_tx_with_neither_xor_branch() { + // `sequenced_l2_txs` must be either a user-op row + // (user_op_pos_in_frame IS NOT NULL) or a direct-input row + // (safe_input_index IS NOT NULL), never both and never neither. + // Setting both to NULL is the clean XOR violation to test — + // FKs are only triggered on non-NULL values so we isolate the + // CHECK constraint. + let db = temp_db("schema-sequenced-l2-tx-xor-neither"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let err = storage.conn.execute( + "INSERT INTO sequenced_l2_txs \ + (offset, batch_index, frame_in_batch, user_op_pos_in_frame, safe_input_index) \ + VALUES (0, 0, 0, NULL, NULL)", + [], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on sequenced_l2_txs XOR, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_deployment_identity_with_zero_chain_id() { + // `deployment_identity.chain_id > 0`. chain_id = 0 would + // collide with the EIP-712 domain's unspecified-chain sentinel + // and break signature recovery; the CHECK refuses to persist it + // in the first place. + let db = temp_db("schema-deployment-chain-id-zero"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let address = vec![0u8; 20]; + let err = storage.conn.execute( + "INSERT INTO deployment_identity \ + (singleton_id, chain_id, app_address, input_box_address, \ + input_box_genesis_block, batch_submitter_address) \ + VALUES (0, 0, ?1, ?1, 0, ?1)", + params![address], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on chain_id > 0, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_safe_input_with_negative_block_number() { + // `safe_inputs.block_number >= 0`. Catches a regression + // that would let a negative block number slip through — the rest + // of the system assumes non-negative and could panic on cast. + let db = temp_db("schema-safe-input-neg-block"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let sender = vec![0u8; 20]; + let err = storage.conn.execute( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (0, ?1, X'00', -1)", + params![sender], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on block_number >= 0, got: {err:?}", + ); + } +} + +mod tree_invariants { + use super::*; + + // ── Parent-pointer tree invariants ────────────────────────────── + use crate::storage::convert::{i64_to_u64, u64_to_i64}; + use rusqlite::params; + + /// Check the tree invariants that should hold at every quiescent state: + /// - Every valid batch has `nonce = parent.nonce + 1`, or `nonce = 0` + /// with `parent_batch_index IS NULL` (genesis/post-torn-cascade). + /// - Every `parent_batch_index` either is NULL or references an + /// existing batch (FK handles this, but we assert explicitly). + /// - Walking up `parent_batch_index` from any valid batch terminates + /// at a NULL-parent row within `batch_index` hops (no cycles). + /// - The valid path is strictly contiguous in `nonce`: the set of + /// nonces among valid batches is `{0, 1, ..., max_valid_nonce}`. + /// - At most one `valid_open_batch` row exists. + fn assert_tree_invariants(storage: &mut Storage) { + // 1. Nonce = parent.nonce + 1 (or nonce=0 for NULL parent). + let mut stmt = storage + .conn + .prepare( + "SELECT b.batch_index, b.parent_batch_index, b.nonce, p.nonce \ + FROM batches b LEFT JOIN batches p ON p.batch_index = b.parent_batch_index", + ) + .expect("prepare"); + let rows: Vec<(i64, Option, i64, Option)> = stmt + .query_map([], |row| { + Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) + }) + .expect("query") + .collect::>() + .expect("collect"); + drop(stmt); + for (bi, parent, nonce, parent_nonce) in &rows { + match (parent, parent_nonce) { + (None, _) => assert_eq!( + *nonce, 0, + "batch {bi}: NULL parent must have nonce 0, got {nonce}" + ), + (Some(_), None) => panic!("batch {bi}: parent exists but parent row missing"), + (Some(_), Some(pn)) => assert_eq!( + *nonce, + pn + 1, + "batch {bi}: nonce={nonce}, expected parent.nonce+1 = {}", + pn + 1 + ), + } + } + + // 2. At most one valid open batch. + let open_count: i64 = storage + .conn + .query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + }) + .expect("count open"); + assert!(open_count <= 1, "more than one valid Tip: {open_count}"); + + // 3. Valid-path nonce contiguity: nonces on the valid chain are 0..N. + let mut valid_nonces: Vec = storage + .conn + .prepare("SELECT nonce FROM valid_batches ORDER BY nonce ASC") + .expect("prepare") + .query_map([], |row| row.get::<_, i64>(0)) + .expect("query") + .collect::>() + .expect("collect"); + // There can be multiple valid batches with the SAME nonce only if + // they live on different branches — but we don't allow that; valid + // batches form a strict chain. So dedup-and-equal means contiguous. + valid_nonces.sort(); + valid_nonces.dedup(); + for (i, &n) in valid_nonces.iter().enumerate() { + assert_eq!( + n, i as i64, + "valid nonces not contiguous: got {valid_nonces:?}" + ); + } + + // 4. Parent walk terminates at NULL in ≤ batch_index hops for every valid row. + for (bi, _, _, _) in &rows { + let mut cur: i64 = *bi; + let bi_u = i64_to_u64(*bi); + for _ in 0..=bi_u { + let parent: Option = storage + .conn + .query_row( + "SELECT parent_batch_index FROM batches WHERE batch_index = ?1", + params![cur], + |row| row.get(0), + ) + .expect("parent lookup"); + match parent { + None => break, + Some(p) => { + assert!( + p < cur, + "batch {bi}: parent-walk went backward ({p} >= {cur}) — cycle?" + ); + cur = p; + } + } + } + } + } + + #[test] + fn tree_invariants_hold_across_mixed_workload() { + // Exercises every mutating code path: genesis, rotations, partial + // cascades (ancestor survives), cascades across accepted frontier, + // torn cascades (no valid ancestor), and back-to-back generations. + // Asserts tree invariants after each step. + let db = temp_db("tree-invariants-workload"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + // Phase 1: genesis + 4 rotations. Simple chain. + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + assert_tree_invariants(&mut storage); + for _ in 0..4 { + storage + .close_frame_and_batch(&mut head, 100) + .expect("close"); + assert_tree_invariants(&mut storage); + } + // Tree: 0(Gold sentinel in concept)→1→2→3→4 (Tip) + + // Phase 2: cascade with a valid ancestor. Batch 0 is accepted first. + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append accepted"); + storage + .append_safe_inputs(1400, &[], SENDER_A, &default_protocol_timing()) + .expect("advance past threshold"); + let inv = storage.recover_post_flush(1200).expect("recover"); + assert!(!inv.is_empty(), "partial cascade should invalidate"); + assert_tree_invariants(&mut storage); + + // Phase 3: more rotations after partial cascade. + let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..3 { + storage + .close_frame_and_batch(&mut head, 1500) + .expect("close gen2"); + assert_tree_invariants(&mut storage); + } + + // Phase 4: torn cascade — invalidate everything including batch 0. + let latest = storage.latest_batch_index().expect("latest").unwrap(); + for bi in 0..=latest { + storage.insert_invalid_batch(bi).expect("invalidate"); + } + storage.recover_post_flush(1200).expect("recover from torn"); + assert_tree_invariants(&mut storage); + + // Phase 5: rotations after torn cascade — new Tip has parent=NULL, nonce=0. + let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..5 { + storage + .close_frame_and_batch(&mut head, 2000) + .expect("close gen3"); + assert_tree_invariants(&mut storage); + } + } + + #[test] + fn subtree_by_batch_index_equals_subtree_by_parent_walk() { + // cascade queries use `batch_index >= N` as a shortcut for + // "subtree rooted at N". This test asserts the equivalence on a + // realistic scenario with multiple cascade generations. + let db = temp_db("subtree-equivalence"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + // Build: 5 batches, cascade from 2 (partial), 3 more, cascade from 1 (torn-ish). + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..4 { + storage + .close_frame_and_batch(&mut head, 100) + .expect("close"); + } + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + SENDER_A, + &default_protocol_timing(), + ) + .expect("append accepted"); + storage + .append_safe_inputs(1400, &[], SENDER_A, &default_protocol_timing()) + .expect("advance"); + let _ = storage.recover_post_flush(1200).expect("cascade 1"); + + let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..2 { + storage + .close_frame_and_batch(&mut head, 1500) + .expect("close"); + } + + // Assert equivalence among VALID batches for every valid N. + // Restricting both sides to `valid_batches` is the invariant cascade + // relies on: its WHERE filters invalidated rows, so the two sets need + // only agree on the valid subset. + let valid_bi: Vec = { + let mut stmt = storage + .conn + .prepare("SELECT batch_index FROM valid_batches ORDER BY batch_index") + .expect("prepare"); + stmt.query_map([], |row| row.get::<_, i64>(0).map(i64_to_u64)) + .expect("query") + .collect::>() + .expect("collect") + }; + for &n in &valid_bi { + let by_index: Vec = { + let mut stmt = storage + .conn + .prepare( + "SELECT batch_index FROM valid_batches \ + WHERE batch_index >= ?1 ORDER BY batch_index", + ) + .expect("prepare"); + stmt.query_map(params![u64_to_i64(n)], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + }) + .expect("query") + .collect::>() + .expect("collect") + }; + let by_subtree: Vec = { + let mut stmt = storage + .conn + .prepare( + "WITH RECURSIVE subtree(batch_index) AS ( \ + SELECT batch_index FROM valid_batches WHERE batch_index = ?1 \ + UNION ALL \ + SELECT b.batch_index FROM valid_batches b \ + JOIN subtree s ON b.parent_batch_index = s.batch_index \ + ) \ + SELECT batch_index FROM subtree ORDER BY batch_index", + ) + .expect("prepare"); + stmt.query_map(params![u64_to_i64(n)], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + }) + .expect("query") + .collect::>() + .expect("collect") + }; + assert_eq!( + by_index, by_subtree, + "cascade root {n}: valid batch_index >= N diverged from valid parent-walk subtree" + ); + } + } +} diff --git a/sequencer/src/storage/safe_accepted_batches.rs b/sequencer/src/storage/safe_accepted_batches.rs new file mode 100644 index 0000000..3ec5f40 --- /dev/null +++ b/sequencer/src/storage/safe_accepted_batches.rs @@ -0,0 +1,160 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Materialized view of the scheduler-accepted batches. +//! +//! `safe_accepted_batches` caches the prefix of submitted batches that the +//! on-chain scheduler would accept, based on an off-chain simulation of its +//! acceptance rules (see [`sequencer_core::protocol::ProtocolTiming`]). +//! +//! Maintenance contract: the view is advanced atomically with each +//! [`super::Storage::append_safe_inputs`] write, so any reader that sees +//! `l1_safe_head` at block B also sees every acceptance decision up to B. No +//! caller should populate this view directly. +//! +//! Readers: +//! - batch submitter frontier / danger reads (`submitter_frontier`, +//! `check_danger`) +//! - recovery cascade (`find_closed_frontier_batch_in_danger`) +//! - wall-clock and stalled-safe-head danger estimates +//! +//! The only writer is [`populate_safe_accepted_batches`], invoked from +//! `append_safe_inputs` inside its transaction. + +use alloy_primitives::Address; +use rusqlite::{Connection, OptionalExtension, Result, params}; + +use super::convert::{i64_to_u64, u64_to_i64}; +use sequencer_core::protocol::{ProtocolTiming, SafeInputView}; + +/// One row of `safe_accepted_batches`, exposing just the columns the +/// frontier-read code paths need. +#[derive(Debug, Clone, Copy)] +pub(super) struct SafeAcceptedBatchRow { + pub safe_input_index: i64, + pub nonce: i64, +} + +/// The most recently accepted row, or `None` if the view is empty. +pub(super) fn query_latest_safe_accepted_batch( + conn: &Connection, +) -> Result> { + conn.query_row( + "SELECT safe_input_index, nonce FROM safe_accepted_batches \ + ORDER BY safe_input_index DESC LIMIT 1", + [], + |row| { + Ok(SafeAcceptedBatchRow { + safe_input_index: row.get(0)?, + nonce: row.get(1)?, + }) + }, + ) + .optional() +} + +/// Next nonce the scheduler is expected to accept — the gold frontier's +/// "expected next" cursor. +/// +/// Returns `latest_accepted.nonce + 1` if any batch has been accepted, else `0`. +/// Equivalently, the nonce that the very next valid closed batch (the cascade +/// pivot, when one exists) will carry, by the contiguity invariant on the +/// valid path (`trg_enforce_nonce_contiguity`). +pub(super) fn frontier_nonce(conn: &Connection) -> Result { + Ok(query_latest_safe_accepted_batch(conn)? + .map(|row| i64_to_u64(row.nonce).saturating_add(1)) + .unwrap_or(0)) +} + +/// Simulate the scheduler's acceptance logic over new safe inputs and append +/// matches to `safe_accepted_batches`. +/// +/// Paginates through `safe_inputs` rows newer than the latest accepted row, +/// pre-filtered at SQL to `batch_submitter` as the sender. For each row, +/// delegates to [`ProtocolTiming::scheduler_accepts`] with the +/// currently-expected nonce — on `Some`, inserts the accepted row and advances +/// expected; on `None`, moves on. The SQL sender filter is an optimization; +/// `scheduler_accepts` re-checks defensively, so the filter is +/// correctness-neutral. +/// +/// `batch_submitter` and `timing` arrive separately because they're +/// orthogonal: the address is identity (who the scheduler accepts from); +/// the timing is the scheduler's staleness rules. +/// +/// The scan cursor is local to one invocation. Persistently, the only cursor +/// is the latest accepted row in `safe_accepted_batches`, not the latest row +/// scanned. That is intentional for now: a recovery batch may reuse the same +/// scheduler nonce after earlier rejected rows, and a too-eager persistent +/// scan cursor would risk skipping it. The tradeoff is that rejected +/// batch-submitter inputs after the gold frontier can be rescanned on later +/// safe-head syncs until a later batch is accepted and moves the accepted +/// cursor forward. +pub(super) fn populate_safe_accepted_batches( + conn: &Connection, + batch_submitter: Address, + timing: &ProtocolTiming, +) -> Result<()> { + const PAGE_SIZE: i64 = 256; + const SELECT_SQL: &str = "SELECT safe_input_index, payload, block_number \ + FROM safe_inputs \ + WHERE sender = ?1 AND safe_input_index > ?2 \ + ORDER BY safe_input_index ASC LIMIT ?3"; + const INSERT_SQL: &str = "INSERT OR IGNORE INTO safe_accepted_batches \ + (safe_input_index, nonce, first_frame_safe_block, inclusion_block) \ + VALUES (?1, ?2, ?3, ?4)"; + + let latest_accepted = query_latest_safe_accepted_batch(conn)?; + let mut cursor = latest_accepted + .map(|row| row.safe_input_index) + .unwrap_or(-1); + let mut expected = latest_accepted + .map(|row| i64_to_u64(row.nonce).saturating_add(1)) + .unwrap_or(0); + + loop { + // Materialize one page before executing any INSERTs. rusqlite's row + // iterator borrows the prepared statement, so we can't INSERT on the + // same connection while iterating. Once the page is collected and the + // statement is dropped, the connection is free for inserts. + let page: Vec<(i64, Vec, i64)> = { + let mut stmt = conn.prepare_cached(SELECT_SQL)?; + stmt.query_map( + params![batch_submitter.as_slice(), cursor, PAGE_SIZE,], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), + )? + .collect::>()? + }; + + if page.is_empty() { + break; + } + let page_len = page.len() as i64; + + let mut insert_stmt = conn.prepare_cached(INSERT_SQL)?; + for (safe_input_index, payload, block_number) in &page { + cursor = *safe_input_index; + let input = SafeInputView { + safe_input_index: i64_to_u64(*safe_input_index), + sender: batch_submitter, + payload: payload.as_slice(), + inclusion_block: i64_to_u64(*block_number), + }; + let Some(accepted) = timing.scheduler_accepts(batch_submitter, input, expected) else { + continue; + }; + insert_stmt.execute(params![ + u64_to_i64(accepted.safe_input_index), + u64_to_i64(accepted.nonce), + u64_to_i64(accepted.first_frame_safe_block), + u64_to_i64(accepted.inclusion_block), + ])?; + expected = expected.saturating_add(1); + } + + if page_len < PAGE_SIZE { + break; + } + } + + Ok(()) +} diff --git a/sequencer/src/storage/sql.rs b/sequencer/src/storage/sql.rs deleted file mode 100644 index 556fdbb..0000000 --- a/sequencer/src/storage/sql.rs +++ /dev/null @@ -1,836 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use rusqlite::{Connection, Result, Row, Transaction, params}; -use std::time::{SystemTime, UNIX_EPOCH}; - -use super::{SafeInputRange, StoredSafeInput}; -use crate::inclusion_lane::PendingUserOp; - -const SQL_SELECT_SAFE_INPUTS_RANGE: &str = include_str!("queries/select_safe_inputs_range.sql"); -const SQL_SELECT_ORDERED_L2_TXS_FROM_OFFSET: &str = - include_str!("queries/select_ordered_l2_txs_from_offset.sql"); -const SQL_SELECT_ORDERED_L2_TXS_PAGE_FROM_OFFSET: &str = - include_str!("queries/select_ordered_l2_txs_page_from_offset.sql"); -const SQL_SELECT_LATEST_BATCH_WITH_USER_OP_COUNT: &str = - include_str!("queries/select_latest_batch_with_user_op_count.sql"); -const SQL_SELECT_LATEST_FRAME_IN_BATCH_FOR_BATCH: &str = - include_str!("queries/select_latest_frame_in_batch_for_batch.sql"); -const SQL_SELECT_USER_OP_COUNT_FOR_FRAME: &str = - include_str!("queries/select_user_op_count_for_frame.sql"); -const SQL_SELECT_ORDERED_L2_TXS_FOR_BATCH: &str = - include_str!("queries/select_ordered_l2_txs_for_batch.sql"); -const SQL_SELECT_LATEST_BATCH_INDEX: &str = "SELECT MAX(batch_index) FROM batches"; -const SQL_SELECT_USER_OPS_FOR_FRAME: &str = "SELECT nonce, max_fee, data, sig FROM user_ops WHERE batch_index = ?1 AND frame_in_batch = ?2 ORDER BY pos_in_frame ASC"; -const SQL_SELECT_MAX_SAFE_INPUT_INDEX: &str = "SELECT MAX(safe_input_index) FROM safe_inputs"; -const SQL_SELECT_ORDERED_L2_TX_COUNT: &str = "SELECT COUNT(*) FROM sequenced_l2_txs"; -const SQL_SELECT_BATCH_POLICY: &str = "SELECT log_recommended_fee, log_batch_size_target FROM batch_policy_derived WHERE singleton_id = 0 LIMIT 1"; -const SQL_SELECT_SAFE_BLOCK: &str = - "SELECT block_number FROM l1_safe_head WHERE singleton_id = 0 LIMIT 1"; -const SQL_INSERT_SAFE_INPUT: &str = "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) VALUES (?1, ?2, ?3, ?4)"; -const SQL_INSERT_USER_OP: &str = include_str!("queries/insert_user_op.sql"); -const SQL_INSERT_SEQUENCED_DIRECT_INPUT: &str = - include_str!("queries/insert_sequenced_direct_input.sql"); -const SQL_UPDATE_BATCH_POLICY_LOG_GAS_PRICE: &str = - "UPDATE batch_policy SET log_gas_price = ?1 WHERE singleton_id = 0"; -const SQL_UPDATE_BATCH_POLICY_ALPHA: &str = - "UPDATE batch_policy SET log_alpha = ?1, log_one_plus_alpha = ?2 WHERE singleton_id = 0"; -const SQL_UPDATE_SAFE_BLOCK: &str = - "UPDATE l1_safe_head SET block_number = ?1 WHERE singleton_id = 0"; -#[derive(Debug, Clone)] -pub(super) struct OrderedL2TxRow { - pub kind: i64, - pub sender: Option>, - pub data: Option>, - pub fee: Option, - pub payload: Option>, - pub block_number: Option, -} - -#[derive(Debug, Clone)] -pub(super) struct SafeInputRow { - pub safe_input_index: i64, - pub sender: Vec, - pub payload: Vec, - pub block_number: i64, -} - -#[derive(Debug, Clone)] -pub(super) struct FrameHeaderRow { - pub frame_in_batch: i64, - pub fee: i64, - pub safe_block: i64, -} - -#[derive(Debug, Clone)] -pub(super) struct FrameUserOpRow { - pub nonce: i64, - pub max_fee: i64, - pub data: Vec, - pub sig: Vec, -} - -pub(super) fn sql_select_total_drained_direct_inputs(conn: &Connection) -> Result { - const SQL: &str = "SELECT COUNT(*) FROM sequenced_l2_txs WHERE safe_input_index IS NOT NULL"; - conn.query_row(SQL, [], |row| row.get(0)) -} - -pub(super) fn sql_select_max_safe_input_index(conn: &Connection) -> Result> { - conn.query_row( - SQL_SELECT_MAX_SAFE_INPUT_INDEX, - [], - convert_row_to_optional_i64, - ) -} - -pub(super) fn sql_select_latest_batch_index(conn: &Connection) -> Result> { - conn.query_row( - SQL_SELECT_LATEST_BATCH_INDEX, - [], - convert_row_to_optional_i64, - ) -} - -/// Derived batch policy: (log_recommended_fee, log_batch_size_target). -pub(super) fn sql_select_batch_policy(conn: &Connection) -> Result<(i64, i64)> { - conn.query_row(SQL_SELECT_BATCH_POLICY, [], |row| { - Ok((row.get(0)?, row.get(1)?)) - }) -} - -pub(super) fn sql_update_batch_policy_log_gas_price( - conn: &Connection, - log_gas_price: i64, -) -> Result { - conn.execute( - SQL_UPDATE_BATCH_POLICY_LOG_GAS_PRICE, - params![log_gas_price], - ) -} - -pub(super) fn sql_update_batch_policy_alpha( - conn: &Connection, - log_alpha: i64, - log_one_plus_alpha: i64, -) -> Result { - conn.execute( - SQL_UPDATE_BATCH_POLICY_ALPHA, - params![log_alpha, log_one_plus_alpha], - ) -} - -pub(super) fn sql_select_safe_block(conn: &Connection) -> Result { - conn.query_row(SQL_SELECT_SAFE_BLOCK, [], |row| row.get(0)) -} - -pub(super) fn sql_update_safe_block(conn: &Connection, safe_block: i64) -> Result { - conn.execute(SQL_UPDATE_SAFE_BLOCK, params![safe_block]) -} - -pub(super) fn sql_select_safe_inputs_range( - conn: &Connection, - from_inclusive: i64, - to_exclusive: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_SAFE_INPUTS_RANGE)?; - let mapped = stmt.query_map( - params![from_inclusive, to_exclusive], - convert_row_to_safe_input_row, - )?; - mapped.collect() -} - -pub(super) fn sql_select_frames_for_batch( - conn: &Connection, - batch_index: i64, -) -> Result> { - const SQL: &str = "SELECT frame_in_batch, fee, safe_block FROM frames WHERE batch_index = ?1 ORDER BY frame_in_batch ASC"; - let mut stmt = conn.prepare_cached(SQL)?; - let mapped = stmt.query_map(params![batch_index], convert_row_to_frame_header_row)?; - mapped.collect() -} - -pub(super) fn sql_select_user_ops_for_frame( - conn: &Connection, - batch_index: i64, - frame_in_batch: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_USER_OPS_FOR_FRAME)?; - let mapped = stmt.query_map( - params![batch_index, frame_in_batch], - convert_row_to_frame_user_op_row, - )?; - mapped.collect() -} - -pub(super) fn sql_insert_safe_inputs_batch( - tx: &Transaction<'_>, - start_index: u64, - safe_inputs: &[StoredSafeInput], -) -> Result<()> { - if safe_inputs.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_SAFE_INPUT)?; - for (offset, input) in safe_inputs.iter().enumerate() { - stmt.execute(params![ - u64_to_i64(start_index.saturating_add(offset as u64)), - input.sender.as_slice(), - input.payload.as_slice(), - u64_to_i64(input.block_number) - ])?; - } - Ok(()) -} - -/// Insert user-ops into the `user_ops` table. -/// The `trg_sequence_user_op` trigger automatically appends a corresponding row -/// to `sequenced_l2_txs` for each inserted user-op. -pub(super) fn sql_insert_user_ops_batch( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - frame_pos_start: u32, - user_ops: &[PendingUserOp], -) -> Result<()> { - if user_ops.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_USER_OP)?; - for (offset, item) in user_ops.iter().enumerate() { - let pos_in_frame = frame_pos_start.saturating_add(offset as u32); - let sig = item.signed.signature.as_bytes(); - stmt.execute(params![ - batch_index, - frame_in_batch, - i64::from(pos_in_frame), - item.signed.sender.as_slice(), - i64::from(item.signed.user_op.nonce), - i64::from(item.signed.user_op.max_fee), - item.signed.user_op.data.as_ref(), - &sig[..], - to_unix_ms(item.received_at), - ])?; - } - Ok(()) -} - -pub(super) fn sql_insert_sequenced_direct_inputs( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - direct_range: SafeInputRange, -) -> Result<()> { - if direct_range.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_SEQUENCED_DIRECT_INPUT)?; - for safe_input_index in direct_range.start_inclusive..direct_range.end_exclusive { - stmt.execute(params![ - batch_index, - frame_in_batch, - u64_to_i64(safe_input_index), - ])?; - } - Ok(()) -} - -pub(super) fn sql_select_ordered_l2_txs_from_offset( - conn: &Connection, - offset: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_FROM_OFFSET)?; - let mapped = stmt.query_map(params![offset], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_txs_for_batch( - conn: &Connection, - batch_index: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_FOR_BATCH)?; - let mapped = stmt.query_map(params![batch_index], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_txs_page_from_offset( - conn: &Connection, - offset: i64, - limit: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_PAGE_FROM_OFFSET)?; - let mapped = stmt.query_map(params![offset, limit], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_tx_count(conn: &Connection) -> Result { - conn.query_row(SQL_SELECT_ORDERED_L2_TX_COUNT, [], |row| row.get(0)) -} - -pub(super) fn sql_select_latest_batch_with_user_op_count( - tx: &Transaction<'_>, -) -> Result<(i64, i64, i64)> { - tx.query_row( - SQL_SELECT_LATEST_BATCH_WITH_USER_OP_COUNT, - [], - convert_row_to_latest_batch_with_user_op_count, - ) -} - -pub(super) fn sql_select_latest_frame_in_batch_for_batch( - tx: &Transaction<'_>, - batch_index: i64, -) -> Result<(i64, i64, i64)> { - tx.query_row( - SQL_SELECT_LATEST_FRAME_IN_BATCH_FOR_BATCH, - params![batch_index], - |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), - ) -} - -pub(super) fn sql_count_user_ops_for_frame( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, -) -> Result { - tx.query_row( - SQL_SELECT_USER_OP_COUNT_FOR_FRAME, - params![batch_index, frame_in_batch], - |row| row.get(0), - ) -} - -pub(super) fn sql_insert_open_batch(tx: &Transaction<'_>, created_at_ms: i64) -> Result { - const SQL: &str = "INSERT INTO batches (created_at_ms) VALUES (?1)"; - tx.execute(SQL, params![created_at_ms]) -} - -pub(super) fn sql_insert_open_batch_with_index( - tx: &Transaction<'_>, - batch_index: i64, - created_at_ms: i64, -) -> Result { - const SQL: &str = "INSERT INTO batches (batch_index, created_at_ms) VALUES (?1, ?2)"; - tx.execute(SQL, params![batch_index, created_at_ms]) -} - -pub(super) fn sql_insert_open_frame( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - created_at_ms: i64, - fee: i64, - safe_block: i64, -) -> Result { - const SQL: &str = "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) VALUES (?1, ?2, ?3, ?4, ?5)"; - tx.execute( - SQL, - params![batch_index, frame_in_batch, created_at_ms, fee, safe_block], - ) -} - -fn convert_row_to_optional_i64(row: &Row<'_>) -> Result> { - row.get(0) -} - -fn convert_row_to_safe_input_row(row: &Row<'_>) -> Result { - Ok(SafeInputRow { - safe_input_index: row.get(0)?, - sender: row.get(1)?, - payload: row.get(2)?, - block_number: row.get(3)?, - }) -} - -fn convert_row_to_frame_header_row(row: &Row<'_>) -> Result { - Ok(FrameHeaderRow { - frame_in_batch: row.get(0)?, - fee: row.get(1)?, - safe_block: row.get(2)?, - }) -} - -fn convert_row_to_frame_user_op_row(row: &Row<'_>) -> Result { - Ok(FrameUserOpRow { - nonce: row.get(0)?, - max_fee: row.get(1)?, - data: row.get(2)?, - sig: row.get(3)?, - }) -} - -fn convert_row_to_ordered_l2_tx_row(row: &Row<'_>) -> Result { - Ok(OrderedL2TxRow { - kind: row.get(0)?, - sender: row.get(1)?, - data: row.get(2)?, - fee: row.get(3)?, - payload: row.get(4)?, - block_number: row.get(5)?, - }) -} - -fn convert_row_to_latest_batch_with_user_op_count(row: &Row<'_>) -> Result<(i64, i64, i64)> { - Ok((row.get(0)?, row.get(1)?, row.get(2)?)) -} - -fn to_unix_ms(time: SystemTime) -> i64 { - time.duration_since(UNIX_EPOCH) - .unwrap_or_default() - .as_millis() - .try_into() - .unwrap_or(i64::MAX) -} - -fn u64_to_i64(value: u64) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -#[cfg(test)] -mod tests { - use super::{ - FrameHeaderRow, SQL_INSERT_SAFE_INPUT, SQL_INSERT_SEQUENCED_DIRECT_INPUT, - SQL_INSERT_USER_OP, sql_insert_open_batch, sql_insert_open_batch_with_index, - sql_insert_open_frame, sql_insert_safe_inputs_batch, sql_insert_sequenced_direct_inputs, - sql_insert_user_ops_batch, sql_select_batch_policy, sql_select_frames_for_batch, - sql_select_latest_batch_index, sql_select_latest_batch_with_user_op_count, - sql_select_max_safe_input_index, sql_select_ordered_l2_tx_count, - sql_select_ordered_l2_txs_from_offset, sql_select_ordered_l2_txs_page_from_offset, - sql_select_safe_block, sql_select_safe_inputs_range, - sql_select_total_drained_direct_inputs, sql_select_user_ops_for_frame, - sql_update_batch_policy_alpha, sql_update_batch_policy_log_gas_price, - sql_update_safe_block, - }; - use crate::inclusion_lane::PendingUserOp; - use crate::storage::db::Storage; - use crate::storage::{SafeInputRange, StoredSafeInput}; - use alloy_primitives::{Address, Signature}; - use rusqlite::{Connection, params}; - use sequencer_core::user_op::{SignedUserOp, UserOp}; - use std::time::SystemTime; - use tokio::sync::oneshot; - - fn setup_conn() -> Connection { - let mut conn = Connection::open_in_memory().expect("open in-memory sqlite"); - Storage::run_migrations(&mut conn).expect("run migrations"); - conn - } - - fn sample_pending_user_op(seed: u8, nonce: u32, max_fee: u16) -> PendingUserOp { - let sender = Address::from_slice(&[seed; 20]); - let signature = Signature::test_signature(); - let (respond_to, _recv) = oneshot::channel(); - PendingUserOp { - signed: SignedUserOp { - sender, - signature, - user_op: UserOp { - nonce, - max_fee, - data: vec![seed].into(), - }, - }, - respond_to, - received_at: SystemTime::now(), - } - } - - fn seed_open_batch0_frame0(conn: &mut Connection) { - let tx = conn.transaction().expect("start tx"); - sql_insert_open_batch_with_index(&tx, 0, 123).expect("insert batch 0"); - sql_insert_open_frame(&tx, 0, 0, 123, 0, 0).expect("insert frame 0"); - tx.commit().expect("commit tx"); - } - - #[test] - fn max_index_helpers_work_for_empty_and_non_empty_tables() { - let mut conn = setup_conn(); - - assert_eq!( - sql_select_total_drained_direct_inputs(&conn).expect("total drained"), - 0 - ); - assert_eq!( - sql_select_max_safe_input_index(&conn).expect("query max direct input"), - None - ); - - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input 0"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![1_i64, vec![0x22_u8; 20], vec![0xbb_u8], 11_i64], - ) - .expect("insert direct input 1"); - assert_eq!( - sql_select_max_safe_input_index(&conn).expect("query max direct input"), - Some(1) - ); - - seed_open_batch0_frame0(&mut conn); - let tx = conn.transaction().expect("start tx"); - tx.execute( - SQL_INSERT_SEQUENCED_DIRECT_INPUT, - params![0_i64, 0_i64, 0_i64], - ) - .expect("insert sequenced direct input"); - tx.commit().expect("commit tx"); - - assert_eq!( - sql_select_total_drained_direct_inputs(&conn).expect("total drained"), - 1 - ); - - let tx = conn.transaction().expect("start tx"); - assert_eq!( - sql_select_max_safe_input_index(&tx).expect("query max direct input in tx"), - Some(1) - ); - } - - #[test] - fn safe_inputs_range_is_half_open_and_ordered() { - let conn = setup_conn(); - - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input 0"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![1_i64, vec![0x22_u8; 20], vec![0xbb_u8], 11_i64], - ) - .expect("insert direct input 1"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![2_i64, vec![0x33_u8; 20], vec![0xcc_u8], 12_i64], - ) - .expect("insert direct input 2"); - - let empty = sql_select_safe_inputs_range(&conn, 1, 1).expect("query empty interval"); - assert!(empty.is_empty()); - - let rows = sql_select_safe_inputs_range(&conn, 0, 2).expect("query non-empty interval"); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0].safe_input_index, 0); - assert_eq!(rows[1].safe_input_index, 1); - } - - #[test] - fn ordered_l2_query_follows_sequenced_offset_order() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x20_u8; 20], - 0_i64, - 1_i64, - vec![0x30_u8], - vec![0x40_u8; 65], - 0_i64 - ], - ) - .expect("insert user op"); - // The trg_sequence_user_op trigger automatically inserts the sequenced row. - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input"); - conn.execute( - SQL_INSERT_SEQUENCED_DIRECT_INPUT, - params![0_i64, 0_i64, 0_i64], - ) - .expect("insert sequenced direct input"); - - let rows = sql_select_ordered_l2_txs_from_offset(&conn, 0).expect("query ordered l2"); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0].kind, 0); - assert_eq!(rows[0].fee, Some(0)); - assert_eq!(rows[1].kind, 1); - assert_eq!(rows[1].fee, None); - - let paged = sql_select_ordered_l2_txs_page_from_offset(&conn, 1, 1).expect("query page"); - assert_eq!(paged.len(), 1); - assert_eq!(paged[0].kind, 1); - assert_eq!( - sql_select_ordered_l2_tx_count(&conn).expect("query ordered count"), - 2 - ); - } - - #[test] - fn batch_and_frame_helpers_start_empty_before_lane_initialization() { - let mut conn = setup_conn(); - let tx = conn.transaction().expect("start tx"); - - let err = sql_select_latest_batch_with_user_op_count(&tx).expect_err("no batch yet"); - assert!(matches!(err, rusqlite::Error::QueryReturnedNoRows)); - } - - #[test] - fn latest_batch_index_and_frames_for_batch_helpers_work() { - let mut conn = setup_conn(); - // No batches yet. - assert_eq!( - sql_select_latest_batch_index(&conn).expect("query latest batch nonce"), - None - ); - - // Seed batch 0 / frame 0, then batch 1 / frame 0. - seed_open_batch0_frame0(&mut conn); - { - let tx = conn.transaction().expect("start tx"); - sql_insert_open_batch(&tx, 456).expect("insert batch 1"); - let next_batch = tx.last_insert_rowid(); - sql_insert_open_frame(&tx, next_batch, 0, 456, 3, 5) - .expect("insert frame 0 for batch 1"); - tx.commit().expect("commit tx"); - } - - let latest = sql_select_latest_batch_index(&conn) - .expect("query latest batch nonce") - .expect("latest batch should exist"); - assert_eq!(latest, 1); - - let frames = sql_select_frames_for_batch(&conn, 1).expect("query frames for batch 1"); - assert_eq!(frames.len(), 1); - let FrameHeaderRow { - frame_in_batch, - fee, - safe_block, - } = frames[0].clone(); - assert_eq!(frame_in_batch, 0); - assert_eq!(fee, 3); - assert_eq!(safe_block, 5); - } - - #[test] - fn user_ops_for_frame_helper_returns_ordered_rows() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - // Insert two user-ops with different pos_in_frame values. - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 1_i64, - vec![0x10_u8; 20], - 0_i64, - 1_i64, - vec![0x01_u8], - vec![0x55_u8; 65], - 0_i64 - ], - ) - .expect("insert first user op"); - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x20_u8; 20], - 1_i64, - 2_i64, - vec![0x02_u8], - vec![0x66_u8; 65], - 0_i64 - ], - ) - .expect("insert second user op"); - - let rows = sql_select_user_ops_for_frame(&conn, 0, 0).expect("query user ops for frame"); - assert_eq!(rows.len(), 2); - // Ordered by pos_in_frame ASC: nonce 1 comes from pos 1, then nonce 0 from pos 0. - assert_eq!(rows[0].nonce, 1); - assert_eq!(rows[1].nonce, 0); - } - - #[test] - fn open_batch_and_frame_insert_helpers_work() { - let mut conn = setup_conn(); - let tx = conn.transaction().expect("start tx"); - - sql_insert_open_batch(&tx, 123).expect("insert open batch"); - let new_batch = tx.last_insert_rowid(); - sql_insert_open_frame(&tx, new_batch, 0, 123, 7, 9).expect("insert open frame"); - tx.commit().expect("commit tx"); - - let batch_count: i64 = conn - .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) - .expect("count batches"); - let frame_count: i64 = conn - .query_row("SELECT COUNT(*) FROM frames", [], |row| row.get(0)) - .expect("count frames"); - assert_eq!(batch_count, 1); - assert_eq!(frame_count, 1); - } - - #[test] - fn batch_policy_helpers_read_defaults_and_update_knobs() { - let conn = setup_conn(); - // Default: log_gas_price=0 → log_recommended_fee=0+20+419+621=1060 - // log_batch_size_target = 1403 - (-229) - 419 = 1213 - let (log_fee, log_target) = sql_select_batch_policy(&conn).expect("read policy"); - assert_eq!(log_fee, 20 + 419 + 621); // 1060 - assert_eq!(log_target, 1403 - (-229) - 419); // 1213 - - sql_update_batch_policy_log_gas_price(&conn, 100).expect("update log gas price"); - let (log_fee, _) = sql_select_batch_policy(&conn).expect("read updated policy"); - assert_eq!(log_fee, 100 + 20 + 419 + 621); // 1160 - - // Update alpha: num=200, denom=1000 → log_alpha=-207, log_one_plus_alpha=23 - // View derives: log_batch_size_target = 1403 - (-207) - 419 = 1191 - sql_update_batch_policy_alpha(&conn, -207, 23).expect("update alpha"); - let (log_fee, log_target) = sql_select_batch_policy(&conn).expect("read updated target"); - assert_eq!(log_target, 1403 - (-207) - 419); // 1191 - assert_eq!(log_fee, 100 + 23 + 419 + 621); // 1163 - } - - #[test] - fn batch_policy_check_rejects_unsafe_alpha() { - let conn = setup_conn(); - // log_alpha=-350 → log_batch_size_target = 1403-(-350)-419 = 1334 >= log_max_batch_bytes=1333 - let err = sql_update_batch_policy_alpha(&conn, -350, 0); - assert!( - err.is_err(), - "CHECK should reject unsafe alpha (log_batch_size_target >= log_max_batch_bytes)" - ); - } - - #[test] - fn l1_safe_head_helpers_read_and_update_singleton() { - let conn = setup_conn(); - assert_eq!(sql_select_safe_block(&conn).expect("read safe block"), 0); - sql_update_safe_block(&conn, 12).expect("update safe block"); - assert_eq!(sql_select_safe_block(&conn).expect("read updated"), 12); - } - - #[test] - fn batch_insert_helpers_insert_multiple_rows() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - let tx = conn.transaction().expect("start tx"); - - let safe_inputs = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xaa_u8], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xbb_u8], - block_number: 11, - }, - ]; - sql_insert_safe_inputs_batch(&tx, 0, safe_inputs.as_slice()) - .expect("insert direct inputs batch"); - - let user_ops = vec![ - sample_pending_user_op(0x20, 0, 1), - sample_pending_user_op(0x21, 1, 1), - ]; - sql_insert_user_ops_batch(&tx, 0, 0, 0, user_ops.as_slice()) - .expect("insert user ops + sequenced batch"); - - sql_insert_sequenced_direct_inputs( - &tx, - 0, - 0, - SafeInputRange::new(0, safe_inputs.len() as u64), - ) - .expect("insert sequenced direct inputs batch"); - - tx.commit().expect("commit tx"); - - let direct_inputs_count: i64 = conn - .query_row("SELECT COUNT(*) FROM safe_inputs", [], |row| row.get(0)) - .expect("count direct inputs"); - let user_ops_count: i64 = conn - .query_row("SELECT COUNT(*) FROM user_ops", [], |row| row.get(0)) - .expect("count user ops"); - let sequenced_count: i64 = conn - .query_row("SELECT COUNT(*) FROM sequenced_l2_txs", [], |row| { - row.get(0) - }) - .expect("count sequenced l2 txs"); - - assert_eq!(direct_inputs_count, 2); - assert_eq!(user_ops_count, 2); - assert_eq!(sequenced_count, 4); - } - - #[test] - fn user_op_uniqueness_is_sender_nonce() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - // Same nonce with different senders should be accepted. - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x11_u8; 20], - 0_i64, - 0_i64, - vec![0x01_u8], - vec![0x55_u8; 65], - 0_i64 - ], - ) - .expect("insert first user op"); - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 1_i64, - vec![0x22_u8; 20], - 0_i64, - 0_i64, - vec![0x02_u8], - vec![0x66_u8; 65], - 0_i64 - ], - ) - .expect("insert second user op with same nonce and different sender"); - - // Same sender + nonce should violate uniqueness. - let duplicate_sender_nonce = conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 2_i64, - vec![0x11_u8; 20], - 0_i64, - 0_i64, - vec![0x03_u8], - vec![0x77_u8; 65], - 0_i64 - ], - ); - assert!( - duplicate_sender_nonce.is_err(), - "duplicate (sender, nonce) should fail" - ); - } -} diff --git a/sequencer/src/storage/test_helpers.rs b/sequencer/src/storage/test_helpers.rs new file mode 100644 index 0000000..3ba67df --- /dev/null +++ b/sequencer/src/storage/test_helpers.rs @@ -0,0 +1,110 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared test fixtures used by `#[cfg(test)]` modules in `storage/`. + +use alloy_primitives::Address; +use sequencer_core::l2_tx::SequencedL2Tx; +use sequencer_core::protocol::ProtocolTiming; +use tempfile::TempDir; + +use super::{SafeInputRange, Storage, StoredSafeInput}; + +pub(crate) const SENDER_A: Address = Address::repeat_byte(0xAA); +pub(crate) const SENDER_B: Address = Address::repeat_byte(0xBB); + +/// Default protocol timing for tests that don't care about specific tuning. +/// Sender-independent (timing carries no address); pair with a submitter +/// address as needed at the call site. +pub(crate) fn default_protocol_timing() -> ProtocolTiming { + ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + } +} + +pub(crate) struct TestDb { + pub _dir: TempDir, + pub path: String, +} + +pub(crate) fn temp_db(name: &str) -> TestDb { + let dir = tempfile::Builder::new() + .prefix(format!("sequencer-{name}-").as_str()) + .tempdir() + .expect("create temporary test directory"); + let path = dir.path().join("sequencer.sqlite"); + TestDb { + _dir: dir, + path: path.to_string_lossy().into_owned(), + } +} + +/// Insert safe inputs whose payloads are SSZ-encoded batches with the given +/// nonces, all attributed to `sender`. `sender` doubles as the +/// batch-submitter address passed to `append_safe_inputs`, so the populated +/// `safe_accepted_batches` view matches this sender. +pub(crate) fn seed_safe_inputs_with_batch_nonces( + storage: &mut Storage, + sender: Address, + safe_block: u64, + nonces: &[u64], +) { + let inputs: Vec = nonces + .iter() + .map(|nonce| StoredSafeInput { + sender, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: *nonce, + frames: Vec::new(), + }), + block_number: safe_block, + }) + .collect(); + storage + .append_safe_inputs( + safe_block, + inputs.as_slice(), + sender, + &default_protocol_timing(), + ) + .expect("append safe inputs"); +} + +/// Create N closed batches (batch indices `0..count-1`) plus one open batch (index `count`). +pub(crate) fn seed_closed_batches(storage: &mut Storage, count: u64) { + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + for _ in 0..count { + let safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, safe_block) + .expect("close batch"); + } +} + +/// Pull every valid sequenced L2 tx out of storage, dropping the offset. +/// Test-only convenience around `ordered_l2_txs_page_from`. +pub(crate) fn all_ordered_l2_txs(storage: &mut Storage) -> Vec { + storage + .ordered_l2_txs_page_from(0, 1_000_000) + .expect("load all ordered l2 txs") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() +} + +/// SSZ-encoded single-frame batch payload at the given (nonce, safe_block). +pub(crate) fn make_stale_batch_payload(nonce: u64, safe_block: u64) -> Vec { + ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce, + frames: vec![sequencer_core::batch::Frame { + safe_block, + fee_price: 0, + user_ops: vec![], + }], + }) +} diff --git a/sequencer/tests/batch_submitter_integration.rs b/sequencer/tests/batch_submitter_integration.rs index 945ab7a..930a713 100644 --- a/sequencer/tests/batch_submitter_integration.rs +++ b/sequencer/tests/batch_submitter_integration.rs @@ -4,46 +4,93 @@ //! Integration tests for the batch submitter: worker loop with real storage and mock poster. use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; -use alloy_primitives::Address; use async_trait::async_trait; -use sequencer::batch_submitter::{BatchPoster, BatchPosterError, TxHash}; -use sequencer::batch_submitter::{BatchSubmitter, BatchSubmitterConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::l1::submitter::{BatchPoster, BatchPosterError, TxHash}; +use sequencer::l1::submitter::{BatchSubmitter, BatchSubmitterConfig}; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage}; use sequencer_core::batch::Batch; -use tempfile::TempDir; +use sequencer_core::protocol::ProtocolTiming; -const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); +mod common; +use common::{TestDb, temp_db}; -/// Minimal mock for integration tests: records submissions. +/// Minimal mock for integration tests. +/// +/// Records submissions. Optionally delays each `submit_batches` call (to race +/// a concurrent writer against the submitter loop), and can fail a configurable +/// number of times before succeeding (to exercise the transient-error retry +/// path). struct TestMock { - submissions: std::sync::Mutex>, + submissions: Mutex>, + /// Per-call delay applied inside `submit_batches`. + submit_delay: Mutex, + /// Remaining `submit_batches` calls that should return a Provider error + /// before the real submission path runs. + fail_next_n_submits: Mutex, } impl TestMock { fn new() -> Arc { Arc::new(Self { - submissions: std::sync::Mutex::new(Vec::new()), + submissions: Mutex::new(Vec::new()), + submit_delay: Mutex::new(Duration::ZERO), + fail_next_n_submits: Mutex::new(0), }) } + fn submissions(&self) -> Vec<(u64, usize)> { self.submissions.lock().expect("lock").clone() } + + fn set_submit_delay(&self, delay: Duration) { + *self.submit_delay.lock().expect("lock") = delay; + } + + fn fail_next_n_submits(&self, n: u32) { + *self.fail_next_n_submits.lock().expect("lock") = n; + } } #[async_trait] impl BatchPoster for TestMock { - async fn submit_batch(&self, payload: Vec) -> Result { - let batch_index = ssz::Decode::from_ssz_bytes(payload.as_slice()) - .map(|b: Batch| b.nonce) - .unwrap_or(0); - self.submissions - .lock() - .expect("lock") - .push((batch_index, payload.len())); - Ok(TxHash::ZERO) + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + // Transient-failure hook: consume one of the configured failures + // before anything else, so the tick outcome maps to `Transient` and + // the loop must sleep + retry. + { + let mut slot = self.fail_next_n_submits.lock().expect("lock"); + if *slot > 0 { + *slot -= 1; + return Err(BatchPosterError::Provider( + "injected transient failure".into(), + )); + } + } + + let delay = *self.submit_delay.lock().expect("lock"); + if !delay.is_zero() { + tokio::time::sleep(delay).await; + } + + let mut tx_hashes = Vec::with_capacity(payloads.len()); + for payload in payloads { + let batch_index = ssz::Decode::from_ssz_bytes(payload.as_slice()) + .map(|b: Batch| b.nonce) + .unwrap_or(0); + self.submissions + .lock() + .expect("lock") + .push((batch_index, payload.len())); + tx_hashes.push(TxHash::ZERO); + } + Ok(tx_hashes) } async fn observed_submitted_batch_nonces( @@ -60,20 +107,22 @@ impl BatchPoster for TestMock { } } -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; - -fn temp_db(name: &str) -> (TempDir, String) { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-batch-submitter-it-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - (dir, path.to_string_lossy().into_owned()) +/// Mirrors what `run_preemptive_recovery` does in production: persist a real +/// safe-head observation so `submitter_frontier` has a row to read. Without +/// this, the submitter's first tick errors out on `current_safe_block_required` +/// and the loop exits before submitting anything. +fn seed_observed_safe_head(storage: &mut Storage) { + let timing = + ProtocolTiming::try_new(sequencer_core::MAX_WAIT_BLOCKS, 75, 900, 12).expect("test timing"); + storage + .append_safe_inputs(0, &[], alloy_primitives::Address::ZERO, &timing) + .expect("seed observed safe head"); } /// Seeds storage so batches 1 and 2 are closed and batch 3 is open. fn seed_two_closed_batches(db_path: &str) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); + seed_observed_safe_head(&mut storage); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -89,9 +138,35 @@ fn seed_two_closed_batches(db_path: &str) { .expect("close batch 2"); } +/// Seeds storage so batch 0 is closed and batch 1 is the open Tip. +fn seed_one_closed_batch(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + seed_observed_safe_head(&mut storage); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 0"); +} + +/// Close the current open Tip so it becomes eligible for submission. +fn close_current_tip(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .open_state() + .expect("load open state") + .expect("open Tip exists"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close current Tip"); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { - let (_dir, path) = temp_db("loop-submits"); + let TestDb { _dir, path } = temp_db("loop-submits"); seed_two_closed_batches(&path); let mock = TestMock::new(); @@ -99,14 +174,10 @@ async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { let config = BatchSubmitterConfig { idle_poll_interval_ms: 5000, }; - let submitter = BatchSubmitter::new( - path, - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - shutdown.clone(), - config, - ); - let handle = submitter.start().expect("start batch submitter"); + let submitter = BatchSubmitter::new(path, mock.clone(), config); + let handle = submitter + .start(shutdown.clone()) + .expect("start batch submitter"); // Allow at least one tick to run (worker may submit batch 1 and 2 in one tick). tokio::time::sleep(Duration::from_millis(200)).await; @@ -124,3 +195,116 @@ async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { assert_eq!(submissions[1].0, 1, "second submission should be batch 1"); assert_eq!(submissions[2].0, 2, "third submission should be batch 2"); } + +// ── Loop cadence invariants ─────────────────────────────────────────────── +// +// These pin the behavior the two-worker refactor unlocked: +// - Submitted → re-enter IMMEDIATELY (no sleep). +// - Transient (Poster error) → log + sleep + retry (loop must NOT exit). +// +// Both are loop-level properties that aren't visible from `tick_once`. + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn submitter_re_enters_immediately_after_productive_tick() { + // Design of the test: + // + // t=0ms Submitter starts. Tick 1 loads batch 0, enters submit_batches + // which sleeps for `submit_delay` (400ms) before recording. + // t=100 A concurrent writer closes the Tip, making batch 1 eligible. + // t~400 submit_batches returns. Tick 1 outcome is Submitted(1). + // Loop must re-enter IMMEDIATELY (Submitted branch → `continue`). + // t~400 Tick 2 observes the new batch 1, submits it (another 400ms). + // t~800 submit_batches returns again, Submitted(1). + // t=1200 Test asserts: two submissions landed inside the window. + // + // If `Submitted → sleep idle_poll` ever regresses, tick 2 would wait 10s + // and the second submission would not appear in the 1.2s budget. + let TestDb { _dir, path } = temp_db("loop-immediate-retry"); + seed_one_closed_batch(&path); + + let mock = TestMock::new(); + mock.set_submit_delay(Duration::from_millis(400)); + let shutdown = ShutdownSignal::default(); + let config = BatchSubmitterConfig { + // Ten seconds — anything above ~2s would be enough to fail if the + // immediate-retry cadence regressed to always-sleep. + idle_poll_interval_ms: 10_000, + }; + let submitter = BatchSubmitter::new(path.clone(), mock.clone(), config); + let handle = submitter + .start(shutdown.clone()) + .expect("start batch submitter"); + + // Let tick 1 enter `submit_batches` (which is now blocking on the delay), + // then close the Tip so batch 1 is eligible by the time tick 2 runs. + tokio::time::sleep(Duration::from_millis(100)).await; + close_current_tip(&path); + + // Budget: ~2x the submit delay. With immediate-retry this is plenty. + tokio::time::sleep(Duration::from_millis(1100)).await; + + shutdown.request_shutdown(); + let _ = tokio::time::timeout(Duration::from_secs(2), handle).await; + + let submissions = mock.submissions(); + assert_eq!( + submissions.len(), + 2, + "Submitted-then-new-work must re-enter without sleeping idle_poll=10s; \ + got submissions {submissions:?}" + ); + assert_eq!(submissions[0].0, 0); + assert_eq!(submissions[1].0, 1); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn submitter_recovers_from_transient_poster_error_without_exiting() { + // Design of the test: + // + // t=0ms Submitter starts. Tick 1 calls submit_batches, which returns + // a Provider error (the first of N injected failures). + // t=0ms Loop maps Err(Poster) → TickOutcome::Transient → sleep idle_poll. + // t~80ms Tick 2 runs. submit_batches succeeds, batch 0 recorded. + // t=250ms Test asserts: exactly 1 submission AND loop is still alive. + // + // Regressions this catches: + // - Propagating Poster errors as fatal (loop would exit; handle would + // resolve with BatchSubmitterError before shutdown fires). + // - Forgetting the sleep on Transient (would work, but could busy-loop + // on a persistent error — not tested here, but the retry-count path + // documents the intended cadence). + let TestDb { _dir, path } = temp_db("loop-transient-retry"); + seed_one_closed_batch(&path); + + let mock = TestMock::new(); + mock.fail_next_n_submits(1); + let shutdown = ShutdownSignal::default(); + let config = BatchSubmitterConfig { + // Short poll interval so the retry sleep completes well within the + // test window. Still long enough that accidentally always-sleeping + // would delay the single submission past the assertion. + idle_poll_interval_ms: 50, + }; + let submitter = BatchSubmitter::new(path.clone(), mock.clone(), config); + let handle = submitter + .start(shutdown.clone()) + .expect("start batch submitter"); + + tokio::time::sleep(Duration::from_millis(250)).await; + + assert!( + !handle.is_finished(), + "loop must not exit on a transient Poster error — it should log and retry", + ); + + let submissions = mock.submissions(); + assert_eq!( + submissions.len(), + 1, + "transient failure followed by success should land exactly one submission; got {submissions:?}", + ); + assert_eq!(submissions[0].0, 0); + + shutdown.request_shutdown(); + let _ = tokio::time::timeout(Duration::from_secs(2), handle).await; +} diff --git a/sequencer/tests/chain_id_validation.rs b/sequencer/tests/chain_id_validation.rs new file mode 100644 index 0000000..1ce77aa --- /dev/null +++ b/sequencer/tests/chain_id_validation.rs @@ -0,0 +1,160 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! H7 regression: chain-id/deployment mismatch is caught early in bootstrap. +//! +//! The H7 hardening moved the live RPC chain-id check before deployment +//! identity writes and replaced `assert_eq!` with typed bootstrap errors. This +//! file locks two code paths where the check matters: +//! +//! - Identity path: L1 is unreachable but a deployment identity exists with +//! a different chain_id. Check fires before `InputReader::from_parts`. +//! - Positive control: with a matched chain_id, `ChainIdMismatch` does NOT +//! fire, so the check doesn't misfire on the happy path. +//! +//! The RPC path (L1 reachable, chain_id from `eth_chainId` mismatches) is +//! NOT covered here because `InputReader::new` needs a real InputBox contract +//! deployed at `config.app_address` before the chain-id check fires. That +//! setup only exists in the full rollups-e2e harness (after `just setup`) — +//! see `chain_id_mismatch_via_live_rpc_refuses_boot_test` there. + +use std::time::Duration; + +use alloy_primitives::{Address, address}; +use app_core::application::{WalletApp, WalletConfig}; +use clap::Parser; +use sequencer::RunConfig; +use sequencer::runtime::{BootstrapError, IdentityError, RunError}; +use tempfile::TempDir; + +// Anvil's default devnet private key #0. +const ANVIL_KEY: &str = "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80"; +const TEST_APP_ADDR: &str = "0x1111111111111111111111111111111111111111"; + +/// Verify that `anvil` is available. Panics with a clear message if not found. +fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); +} + +fn build_config( + data_dir: &str, + eth_rpc_url: &str, + chain_id: u64, +) -> Result { + RunConfig::try_parse_from([ + "sequencer", + "--http-addr", + "127.0.0.1:0", + "--data-dir", + data_dir, + "--eth-rpc-url", + eth_rpc_url, + "--chain-id", + &chain_id.to_string(), + "--app-address", + TEST_APP_ADDR, + "--batch-submitter-private-key", + ANVIL_KEY, + ]) +} + +fn build_app() -> WalletApp { + WalletApp::new(WalletConfig::default()) +} + +// ── Deployment-identity fallback path ─────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn chain_id_mismatch_from_deployment_identity_returns_typed_error() { + // Scenario: L1 is unreachable, but a deployment identity exists from a + // previous successful run. The stored chain_id does NOT match the current + // config. The fallback arm must return a typed identity mismatch. + + let dir = TempDir::new().expect("tempdir"); + let data_dir = dir.path().to_str().unwrap(); + + // Pre-populate deployment identity with chain_id=31337. + let db_path = format!("{data_dir}/sequencer.db"); + { + let mut storage = sequencer::storage::Storage::open(&db_path).expect("open db for seed"); + storage + .load_or_insert_deployment_identity(sequencer::storage::DeploymentIdentity { + chain_id: 31_337, + app_address: address!("0x1111111111111111111111111111111111111111"), + input_box_address: Address::from_slice(&[0x22; 20]), + input_box_genesis_block: 100, + batch_submitter_address: address!("0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266"), + }) + .expect("seed deployment identity"); + } + + // Point the sequencer at an unreachable RPC (port 1, reliably refused) and + // a MISMATCHED chain_id=1. L1 is unreachable → identity-fallback path runs + // → stored chain_id (31337) mismatches config (1). + let config = build_config(data_dir, "http://127.0.0.1:1", 1).expect("parse config"); + + let result = tokio::time::timeout(Duration::from_secs(30), sequencer::run(build_app(), config)) + .await + .expect("run() must return quickly on mismatch"); + + match result { + Err(RunError::Bootstrap(BootstrapError::Identity(IdentityError::Mismatch { + fields, + stored, + expected, + }))) => { + assert_eq!(fields, "chain_id"); + assert_eq!(stored.chain_id, 31_337); + assert_eq!(expected.chain_id, 1); + } + other => panic!("expected IdentityError::Mismatch, got: {other:?}"), + } +} + +// ── Positive: matched chain_id does NOT trigger ChainIdMismatch ────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn chain_id_match_does_not_produce_mismatch_error() { + // Positive control: when chain_id matches, we should NOT get ChainIdMismatch. + // (The sequencer then tries to start the full stack. We don't care about + // that — a timeout counts as "didn't return ChainIdMismatch early", which + // is what we want to verify.) + require_anvil(); + + let anvil = alloy::node_bindings::Anvil::default().spawn(); + let rpc_url = anvil.endpoint(); + let dir = TempDir::new().expect("tempdir"); + let config = build_config(dir.path().to_str().unwrap(), &rpc_url, 31_337) + .expect("parse config with matching chain_id"); + + // Short timeout: if ChainIdMismatch is going to fire, it fires fast. + // A timeout means the check passed and the sequencer is running normally. + let result = + tokio::time::timeout(Duration::from_secs(3), sequencer::run(build_app(), config)).await; + + match result { + Err(_timeout) => {} // expected — sequencer is running + Ok(Err(RunError::Bootstrap(BootstrapError::ChainIdMismatch { rpc, config }))) => { + panic!( + "matched chain_id must not produce ChainIdMismatch, got rpc={rpc} config={config}" + ); + } + Ok(Err(other)) => { + // Some other error is fine — we only care that it's not ChainIdMismatch. + eprintln!( + "sequencer returned non-mismatch error (expected under test conditions): {other:?}" + ); + } + Ok(Ok(())) => { + panic!("sequencer should not complete run() in a short test window"); + } + } +} diff --git a/sequencer/tests/common/mod.rs b/sequencer/tests/common/mod.rs new file mode 100644 index 0000000..45b9afa --- /dev/null +++ b/sequencer/tests/common/mod.rs @@ -0,0 +1,27 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared fixtures for `sequencer/tests/*.rs` integration tests. +//! +//! Integration tests compile as separate crates and cannot reach the +//! `#[cfg(test)]` helpers inside `sequencer/src/`. This module keeps the same +//! `TestDb` shape so callers work identically on both sides. + +use tempfile::TempDir; + +pub struct TestDb { + pub _dir: TempDir, + pub path: String, +} + +pub fn temp_db(name: &str) -> TestDb { + let dir = tempfile::Builder::new() + .prefix(format!("sequencer-{name}-").as_str()) + .tempdir() + .expect("create temporary test directory"); + let path = dir.path().join("sequencer.sqlite"); + TestDb { + _dir: dir, + path: path.to_string_lossy().into_owned(), + } +} diff --git a/sequencer/tests/e2e_sequencer.rs b/sequencer/tests/e2e_sequencer.rs index 2869f2f..9b7bea7 100644 --- a/sequencer/tests/e2e_sequencer.rs +++ b/sequencer/tests/e2e_sequencer.rs @@ -12,23 +12,186 @@ use app_core::application::{ use futures_util::StreamExt; use k256::ecdsa::SigningKey; use k256::ecdsa::signature::hazmat::PrehashSigner; -use sequencer::api::{self, ApiConfig}; -use sequencer::inclusion_lane::{ +use sequencer::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use sequencer::http::{self, ApiConfig}; +use sequencer::ingress::inclusion_lane::{ InclusionLane, InclusionLaneConfig, InclusionLaneError, PendingUserOp, }; -use sequencer::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage, StoredSafeInput}; use sequencer_core::api::{TxRequest, TxResponse, WsTxMessage}; use sequencer_core::l2_tx::SequencedL2Tx; use sequencer_core::user_op::UserOp; use sequencer_rust_client::SequencerClient; -use tempfile::TempDir; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_tungstenite::connect_async; use tokio_tungstenite::tungstenite::Message; +mod common; +use common::temp_db; + +// ── V1 regression: cross-boundary signature domain consistency ──────── +// +// The sequencer signs user-ops with `sequencer_core::build_input_domain`. The +// scheduler (canonical-app) recovers senders with the same function. If the +// two sides ever drift (the V1 bug: scheduler had `name: None`, sequencer had +// `name: Some("CartesiAppSequencer")`), every signature recovers a different +// address on each side, structurally breaking the rollup. +// +// These tests lock the invariant at two levels: +// 1. A signature built via the shared constructor recovers the signer's +// address (positive). +// 2. A signature built with ANY domain that differs from the shared +// constructor recovers a DIFFERENT address (negative — proves the domain +// actually affects recovery). + +#[test] +fn v1_regression_shared_domain_recovers_signer() { + use alloy_sol_types::SolStruct; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let chain_id = 31_337_u64; + let app = Address::from_slice(&[0xaa; 20]); + let domain = sequencer_core::build_input_domain(chain_id, app); + + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01, 0x02, 0x03].into(), + }; + + // Sign with the shared domain. + let hash = user_op.eip712_signing_hash(&domain); + let k256_sig = signing_key.sign_prehash(hash.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Recover with the shared domain — must equal signer. + let hash_again = user_op.eip712_signing_hash(&domain); + let recovered = signature + .recover_address_from_prehash(&hash_again) + .expect("recover"); + assert_eq!( + recovered, signer_address, + "shared domain must recover signer" + ); +} + +#[test] +fn v1_regression_name_none_domain_recovers_different_address() { + use alloy_sol_types::{Eip712Domain, SolStruct}; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let chain_id = 31_337_u64; + let app = Address::from_slice(&[0xaa; 20]); + let correct_domain = sequencer_core::build_input_domain(chain_id, app); + + // The exact buggy domain the scheduler used pre-V1 fix. + let buggy_domain = Eip712Domain { + name: None, + version: None, + chain_id: Some(U256::from(chain_id)), + verifying_contract: Some(app), + salt: None, + }; + + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01, 0x02, 0x03].into(), + }; + + // Sign with the correct (shared) domain. + let hash = user_op.eip712_signing_hash(&correct_domain); + let k256_sig = signing_key.sign_prehash(hash.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Recover with the buggy domain — must NOT recover the signer. + // (This is what would silently fail at the scheduler under the V1 bug.) + let buggy_hash = user_op.eip712_signing_hash(&buggy_domain); + let recovered_under_buggy = signature + .recover_address_from_prehash(&buggy_hash) + .expect("recovery succeeds but returns the wrong address"); + assert_ne!( + recovered_under_buggy, signer_address, + "a name:None domain must not recover the signer — if this fails, \ + the shared domain constructor is bit-identical to the buggy one, \ + meaning the V1 fix regressed" + ); +} + +#[test] +fn v1_regression_domain_fields_all_affect_recovery() { + use alloy_sol_types::SolStruct; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let app = Address::from_slice(&[0xaa; 20]); + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01].into(), + }; + + // Sign with chain_id = 1. + let chain_a = sequencer_core::build_input_domain(1, app); + let hash_a = user_op.eip712_signing_hash(&chain_a); + let k256_sig = signing_key.sign_prehash(hash_a.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash_a) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Cross-chain replay must fail: recover under chain_id=2 with the same app. + let chain_b = sequencer_core::build_input_domain(2, app); + let hash_b = user_op.eip712_signing_hash(&chain_b); + let recovered_b = signature + .recover_address_from_prehash(&hash_b) + .expect("recovery returns some address"); + assert_ne!( + recovered_b, signer_address, + "cross-chain replay must not recover signer" + ); + + // Cross-app replay must fail: recover under same chain but different app. + let other_app = Address::from_slice(&[0xbb; 20]); + let chain_a_app_other = sequencer_core::build_input_domain(1, other_app); + let hash_app_other = user_op.eip712_signing_hash(&chain_a_app_other); + let recovered_app_other = signature + .recover_address_from_prehash(&hash_app_other) + .expect("recovery returns some address"); + assert_ne!( + recovered_app_other, signer_address, + "cross-app replay must not recover signer" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn e2e_submit_tx_ack_and_broadcast() { let db = temp_db("full-e2e"); @@ -54,7 +217,7 @@ async fn e2e_submit_tx_ack_and_broadcast() { // The deposit is broadcast first. let deposit_message = recv_ws_message(&mut ws).await; match deposit_message { - WsTxMessage::DirectInput { offset, .. } => assert_eq!(offset, 0), + WsTxMessage::DirectInput { offset, .. } => assert_eq!(offset, 1), other => panic!("expected deposit direct input as first WS message, got {other:?}"), } let method = Method::Withdrawal(Withdrawal { @@ -96,7 +259,7 @@ async fn e2e_submit_tx_ack_and_broadcast() { fee, data, } => { - assert_eq!(offset, 1); + assert_eq!(offset, 2); assert_eq!(ws_sender, sender.to_string()); // Frame fee is the default log_recommended_fee = 1060. assert_eq!(fee, 1060); @@ -228,6 +391,43 @@ async fn api_rejects_malformed_json_as_bad_request() { "expected bad-request error code, got: {body}" ); + // / H2 regression: the message must come from the fixed taxonomy + // ("invalid JSON"), NOT reflect serde's line/column/token excerpt. The + // malformed input contains the token `0x1234` — assert it doesn't appear + // in the response body so no attacker-submitted bytes are echoed. + assert!( + body.contains("\"message\":\"invalid JSON\""), + "expected fixed message 'invalid JSON' in body, got: {body}" + ); + assert!( + !body.contains("0x1234"), + "body must not reflect attacker-submitted input bytes, got: {body}" + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_missing_content_type_with_fixed_message() { + // / H2 regression: missing Content-Type must produce a fixed + // `"missing content type"` message, not reflect any part of the request. + let db = temp_db("missing-content-type"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server_with_max_body(db.path.as_str(), domain, 128 * 1024).await + else { + return; + }; + + // Valid JSON body, but sent without Content-Type: application/json. + let (status, body) = post_raw_body_no_content_type(runtime.addr, "{}").await; + assert_eq!(status, 400, "missing content-type: {body}"); + assert!( + body.contains("\"message\":\"missing content type\""), + "expected fixed 'missing content type' message, got: {body}" + ); + shutdown_runtime(runtime).await; } @@ -315,6 +515,451 @@ async fn api_rejects_user_op_payloads_above_application_limit() { shutdown_runtime(runtime).await; } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_json_with_missing_fields_using_fixed_envelope() { + // / H2 regression: a body that is valid JSON but missing required + // fields must respond with the fixed `"invalid JSON"` envelope. The + // response must not echo serde's deserialization error text — that would + // leak our internal field names and parser internals to callers. + let db = temp_db("missing-fields-json"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server_with_max_body(db.path.as_str(), domain, 128 * 1024).await + else { + return; + }; + + // Empty object — valid JSON, missing every required field. + let (status, body) = post_raw_json(runtime.addr, "{}").await; + assert_eq!(status, 400, "missing fields: {body}"); + + // Parse the response envelope and assert the message is exactly the fixed + // taxonomy string. Anything else implies serde leaked internals into the + // body — that's the regression this test pins. + let envelope: serde_json::Value = serde_json::from_str(&body).expect("response is JSON"); + let message = envelope + .get("message") + .and_then(|m| m.as_str()) + .expect("envelope has string `message` field"); + assert_eq!( + message, "invalid JSON", + "response message must be the fixed taxonomy string, got: {message:?} (full body: {body})", + ); + let code = envelope + .get("code") + .and_then(|c| c.as_str()) + .expect("envelope has string `code` field"); + assert_eq!(code, "BAD_REQUEST", "unexpected error code: {body}"); + + // Sanity: serde's typical leak vocabulary must not appear anywhere. + for needle in [ + "missing field", + "expected", + "deserializ", + "line ", + "column ", + ] { + assert!( + !body.contains(needle), + "potential serde leak — body contains {needle:?}: {body}", + ); + } + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_payload_size_check_fires_before_signature_recovery() { + // sharpening: oversized `data` must be rejected by + // `validate_payload_size` BEFORE any cryptographic work. We submit an + // oversized payload paired with a garbage-but-correctly-shaped signature: + // if the size check is enforced first, the response says "user op payload + // too large"; if signature recovery ran first the response would mention a + // signature/sender mismatch instead. Catches a regression that re-orders + // signature verification ahead of size validation, which would open a DoS + // vector (huge body × secp256k1 recovery cost). + let db = temp_db("size-before-sig"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain).await else { + return; + }; + + // Hand-craft a request: oversized data + correctly-shaped but garbage + // signature. The 65-byte signature passes `validate_hex_lengths`, so the + // next gate is `validate_payload_size`. If anyone moves signature recovery + // ahead of it, the response message changes and this assertion fails. + let oversized_data_hex = "00".repeat(MAX_METHOD_PAYLOAD_BYTES + 1); + let bogus_sig_hex = format!("0x{}", "00".repeat(65)); + let body = format!( + "{{\"message\":{{\"nonce\":0,\"max_fee\":0,\"data\":\"0x{oversized_data_hex}\"}},\ + \"signature\":\"{bogus_sig_hex}\",\ + \"sender\":\"0x0000000000000000000000000000000000000001\"}}", + ); + // Confirm the body fits under the default 4 KB body limit so we exercise + // the payload-size gate, not the upstream body-too-large gate. + assert!( + body.len() < 4 * 1024, + "test body must stay under default max_body_bytes (got {} bytes)", + body.len(), + ); + + let (status, response_body) = post_raw_json(runtime.addr, body.as_str()).await; + assert_eq!(status, 400, "oversized + bogus sig: {response_body}"); + assert!( + response_body.contains("user op payload too large"), + "size check must fire before signature verification — \ + expected 'user op payload too large' message, got: {response_body}", + ); + // Defensive: ensure the rejection is NOT a signature-class error. Any of + // these would mean signature recovery ran on the oversized payload. + for sig_marker in [ + "signature", + "sender mismatch", + "recover", + "INVALID_SIGNATURE", + ] { + assert!( + !response_body.contains(sig_marker), + "response mentions {sig_marker:?} — signature recovery may have run \ + before the size check: {response_body}", + ); + } + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_signature_with_invalid_parity_byte() { + // signature with correct length (65 bytes) but a parity byte + // outside the valid set (0/1 or 27/28) must be rejected at the crypto + // boundary with 422. Catches regressions where a new signature codec + // accepts arbitrary parity values and silently drifts recovery. + let db = temp_db("bad-parity-byte"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Correct-length signature (65 bytes) with a non-recoverable parity byte. + let mut bogus_sig = [0_u8; 65]; + bogus_sig[64] = 0xFF; + let bogus_sig_hex = format!("0x{}", alloy_primitives::hex::encode(bogus_sig)); + + let mut request = make_valid_request(&domain); + request.signature = bogus_sig_hex; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + // Observed contract: 400 with `INVALID_SIGNATURE` code, same as + // `forged_signature_rejected_test`. This test pins it. + assert_eq!( + status, 400, + "invalid parity byte must produce 400 (signature-class error), got {status}: {body}", + ); + assert!( + body.contains("INVALID_SIGNATURE"), + "expected INVALID_SIGNATURE code, got: {body}", + ); + // Defensive: make sure the rejection is from the signature layer, not the + // hex-length gate ( covers that) and not the payload-size gate. + assert!( + !body.contains("signature must be") && !body.contains("payload too large"), + "expected sig-recovery class error, not hex-length or size: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_sender_claim_that_mismatches_signature_recovery() { + // `sender` field in the request must equal the address recovered + // from the signature. A valid signature over a user-op paired with a + // different claimed `sender` must be rejected — can't accept someone + // else's signed op as if it came from ourselves. Complements the + // integration-level forged_signature_rejected_test (which asserts the + // end-to-end shape); this one pins the direct API response. + let db = temp_db("sender-mismatch-explicit"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Key A signs the user op; we claim the sender is address B. + let signing_key_a = SigningKey::from_bytes((&[1_u8; 32]).into()).expect("create signing key a"); + let signing_key_b = SigningKey::from_bytes((&[2_u8; 32]).into()).expect("create signing key b"); + let address_a = address_from_signing_key(&signing_key_a); + let address_b = address_from_signing_key(&signing_key_b); + assert_ne!(address_a, address_b, "test setup: A and B must differ"); + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: Vec::new().into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key_a), + sender: address_b.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + // Observed: 400 `INVALID_SIGNATURE` `"sender mismatch"` — same + // signature-class status as the parity-byte test above. + assert_eq!( + status, 400, + "sender-mismatch must produce 400 (signature-class error), got {status}: {body}", + ); + assert!( + body.contains("sender mismatch"), + "expected `sender mismatch` message, got: {body}", + ); + assert!( + body.contains("INVALID_SIGNATURE"), + "expected INVALID_SIGNATURE code, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_user_op_with_nonce_gap() { + // submitting a user-op with a nonce above the next expected one + // (i.e., a gap) must return 422 `InvalidNonce` and leave state + // unchanged. Complement to (nonce too low / replay) — together + // they pin the strict-equality requirement on `current_user_nonce`. + let db = temp_db("nonce-gap-too-high"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(1_000_000_u64))]); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Current user nonce is 0 — a fresh sender has never submitted. Nonce 7 + // leaves a six-slot gap. + let user_op = UserOp { + nonce: 7, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 422, + "nonce gap must produce 422, got {status}: {body}", + ); + assert!( + body.contains("nonce") || body.contains("NONCE"), + "expected nonce-class error, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_accepts_user_op_with_max_fee_equal_to_current_frame_fee() { + // boundary: the check is `max_fee >= current_frame_fee` (strict + // less-than rejects). An op with `max_fee == current_frame_fee` must be + // accepted. Pairs with (`fee_below_minimum_rejected_test`) — the + // two together pin the comparator. + let db = temp_db("fee-boundary-equal"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[9_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + // Fund with enough to cover gas at the frame fee. + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(1_000_000_u64))]); + + // `bootstrap_open_frame` asserts frame_fee == 1060; use that exact value + // for the boundary case. + const FRAME_FEE_BOUNDARY: u16 = 1060; + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + let user_op = UserOp { + nonce: 0, + max_fee: FRAME_FEE_BOUNDARY, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 200, + "max_fee == current_frame_fee boundary must be accepted (comparator is `<`, not `<=`), got {status}: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_user_op_when_balance_below_gas_cost() { + // if sender's balance < `fee_to_linear(current_frame_fee)` the + // user op must be rejected with 422 `InsufficientGasBalance` and leave + // state unchanged. Exercises the balance check in + // `WalletApp::validate_user_op` (app-core). A fresh sender with no + // deposits has balance 0, well below `fee_to_linear(1060)` (the + // bootstrapped frame fee). + let db = temp_db("insufficient-gas-balance"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[11_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + // No deposit for `sender` → balance = 0. + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 422, + "insufficient-balance must produce 422, got {status}: {body}", + ); + assert!( + body.contains("insufficient balance for gas"), + "expected InsufficientGasBalance message, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_concurrent_same_nonce_leaves_exactly_one_committed() { + // two concurrent POSTs for the same (sender, nonce) — one + // succeeds, one is rejected with a nonce-class error. Pins the invariant + // that the rejected half does NOT leave any state artifact: the final + // balance/nonce must match the single-commit path. + let db = temp_db("concurrent-same-nonce"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[13_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(10_000_000_u64))]); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + let request_json = serde_json::to_string(&request).expect("serialize request"); + + // Two concurrent POSTs with byte-identical bodies. + let addr = runtime.addr; + let body_a = request_json.clone(); + let body_b = request_json; + let a = tokio::spawn(async move { post_raw_json(addr, body_a.as_str()).await }); + let b = tokio::spawn(async move { post_raw_json(addr, body_b.as_str()).await }); + let (res_a, res_b) = tokio::try_join!(a, b).expect("join concurrent posts"); + + let outcomes = [res_a, res_b]; + let accepted = outcomes.iter().filter(|(s, _)| *s == 200).count(); + let rejected_bodies: Vec<&String> = outcomes + .iter() + .filter_map(|(s, b)| (*s == 422).then_some(b)) + .collect(); + assert_eq!( + accepted, 1, + "exactly one concurrent submission must be accepted, outcomes: {outcomes:?}", + ); + assert_eq!( + rejected_bodies.len(), + 1, + "exactly one concurrent submission must be rejected with 422, outcomes: {outcomes:?}", + ); + let rejected_body = rejected_bodies[0]; + assert!( + rejected_body.contains("bad nonce") || rejected_body.contains("INVALID_NONCE"), + "rejected concurrent op should be nonce-class, got: {rejected_body}", + ); + + shutdown_runtime(runtime).await; +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn restart_replays_same_ordered_l2_tx_stream_from_db() { let db = temp_db("restart-replay-golden"); @@ -357,15 +1002,16 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { let second_live = recv_ws_message(&mut ws).await; drop(ws); - let expected = load_all_ordered_l2_txs(db.path.as_str()); + let expected = all_ordered_l2_txs(db.path.as_str()); assert_eq!( expected.len(), 3, "expected deposit, direct input, and user op" ); - assert_ws_message_matches_tx(deposit_live, &expected[0], 0); - assert_ws_message_matches_tx(first_live, &expected[1], 1); - assert_ws_message_matches_tx(second_live, &expected[2], 2); + // DB offsets (SQLite rowid) start at 1. + assert_ws_message_matches_tx(deposit_live, &expected[0], 1); + assert_ws_message_matches_tx(first_live, &expected[1], 2); + assert_ws_message_matches_tx(second_live, &expected[2], 3); shutdown_runtime(runtime).await; @@ -384,9 +1030,10 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { .expect("timeout connecting websocket after restart") .expect("connect websocket after restart"); - for (offset, expected_tx) in expected.iter().enumerate() { + for (i, expected_tx) in expected.iter().enumerate() { let replayed = recv_ws_message(&mut restarted_ws).await; - assert_ws_message_matches_tx(replayed, expected_tx, offset as u64); + // DB offsets start at 1. + assert_ws_message_matches_tx(replayed, expected_tx, (i + 1) as u64); } drop(restarted_ws); @@ -396,9 +1043,10 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { struct FullServerRuntime { addr: std::net::SocketAddr, shutdown: ShutdownSignal, - server_task: Option, - lane_handle: - Option>>, + server_task: Option, + lane_handle: Option< + tokio::task::JoinHandle>, + >, _parked_rx: Option>, } @@ -435,7 +1083,7 @@ async fn start_full_server_with_max_body( }; let addr = listener.local_addr().expect("read listener addr"); - let storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let storage = Storage::open(db_path).expect("open storage"); let shutdown = ShutdownSignal::default(); let (tx, lane_handle) = InclusionLane::start( @@ -449,6 +1097,7 @@ async fn start_full_server_with_max_body( safe_input_buffer_capacity: 32, max_batch_open: Duration::from_secs(60 * 60), idle_poll_interval: Duration::from_millis(2), + frontier_min_interval: Duration::ZERO, }, ); @@ -462,7 +1111,7 @@ async fn start_full_server_with_max_body( }, ); - let server_task = api::start_on_listener( + let server_task = http::start_on_listener( listener, tx, domain, @@ -500,7 +1149,7 @@ async fn start_api_only_server( }; let addr = listener.local_addr().expect("read listener addr"); - let _storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let _storage = Storage::open(db_path).expect("open storage"); let (tx, rx) = mpsc::channel::(queue_capacity); let shutdown = ShutdownSignal::default(); let tx_feed = L2TxFeed::new( @@ -512,7 +1161,7 @@ async fn start_api_only_server( batch_submitter_address: None, }, ); - let server_task = api::start_on_listener( + let server_task = http::start_on_listener( listener, tx, domain, @@ -567,28 +1216,40 @@ fn bootstrap_open_frame(db_path: &str) { /// Bootstrap open frame, optionally seeding ERC-20 deposits for the given senders. /// Each sender receives `amount` tokens before the frame is opened. fn bootstrap_open_frame_with_deposits(db_path: &str, deposits: &[(Address, U256)]) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let config = WalletConfig::default(); - if !deposits.is_empty() { - let safe_inputs: Vec = deposits - .iter() - .map(|(sender, amount)| { - let mut payload = Vec::with_capacity(72); - payload.extend_from_slice(config.supported_erc20_token.as_slice()); - payload.extend_from_slice(sender.as_slice()); - payload.extend_from_slice(amount.to_be_bytes::<32>().as_slice()); - StoredSafeInput { - sender: config.erc20_portal_address, - payload, - block_number: 1, - } - }) - .collect(); - storage - .append_safe_inputs(1, &safe_inputs) - .expect("seed deposits"); - } + // Always record a safe-head observation: production callers are gated by + // `run_preemptive_recovery`, so storage paths like `safe_input_frontier` + // assume a row exists. With no deposits we still write an empty advance + // so the lane can start without `current_safe_block_required` failing. + let safe_inputs: Vec = deposits + .iter() + .map(|(sender, amount)| { + let mut payload = Vec::with_capacity(72); + payload.extend_from_slice(config.supported_erc20_token.as_slice()); + payload.extend_from_slice(sender.as_slice()); + payload.extend_from_slice(amount.to_be_bytes::<32>().as_slice()); + StoredSafeInput { + sender: config.erc20_portal_address, + payload, + block_number: 1, + } + }) + .collect(); + storage + .append_safe_inputs( + 1, + &safe_inputs, + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, + ) + .expect("seed safe head (and any deposits)"); let safe_input_count = deposits.len() as u64; let leading_range = SafeInputRange::new(0, safe_input_count); @@ -622,7 +1283,7 @@ fn make_valid_request(domain: &Eip712Domain) -> TxRequest { } fn seed_safe_direct_input(db_path: &str, safe_block: u64, payload: Vec) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); storage .append_safe_inputs( safe_block, @@ -631,18 +1292,25 @@ fn seed_safe_direct_input(db_path: &str, safe_block: u64, payload: Vec) { payload, block_number: safe_block, }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, ) .expect("append safe direct input"); } -fn load_all_ordered_l2_txs(db_path: &str) -> Vec { +fn all_ordered_l2_txs(db_path: &str) -> Vec { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); - let total = storage - .ordered_l2_tx_count() - .expect("query ordered l2 tx count"); storage - .load_ordered_l2_txs_page_from(0, total as usize) + .ordered_l2_txs_page_from(0, 1_000_000) .expect("load ordered l2 txs") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() } fn assert_ws_message_matches_tx( @@ -696,6 +1364,30 @@ fn assert_ws_message_matches_tx( } } +async fn post_raw_body_no_content_type(addr: std::net::SocketAddr, body: &str) -> (u16, String) { + let host_port = addr.to_string(); + let mut stream = tokio::net::TcpStream::connect(host_port.as_str()) + .await + .expect("connect test http socket"); + // Deliberately omit Content-Type header. + let request = format!( + "POST /tx HTTP/1.1\r\nHost: {host_port}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{body}", + body.len() + ); + stream + .write_all(request.as_bytes()) + .await + .expect("write raw request"); + stream.flush().await.expect("flush raw request"); + + let mut response = Vec::new(); + stream + .read_to_end(&mut response) + .await + .expect("read raw response"); + parse_http_response(response.as_slice()) +} + async fn post_raw_json(addr: std::net::SocketAddr, body: &str) -> (u16, String) { let host_port = addr.to_string(); let mut stream = tokio::net::TcpStream::connect(host_port.as_str()) @@ -785,28 +1477,5 @@ fn decode_hex_prefixed(value: &str) -> Vec { } fn test_domain() -> Eip712Domain { - Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(1_u64)), - verifying_contract: Some(Address::from_slice(&[0_u8; 20])), - salt: None, - } -} - -struct TestDb { - _dir: TempDir, - path: String, -} - -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-full-e2e-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } + sequencer_core::build_input_domain(1, Address::from_slice(&[0_u8; 20])) } diff --git a/sequencer/tests/ws_broadcaster.rs b/sequencer/tests/ws_broadcaster.rs index 5b25f4f..0aeaea3 100644 --- a/sequencer/tests/ws_broadcaster.rs +++ b/sequencer/tests/ws_broadcaster.rs @@ -8,20 +8,22 @@ use alloy_primitives::{Address, Signature}; use alloy_sol_types::Eip712Domain; use app_core::application::MAX_METHOD_PAYLOAD_BYTES; use futures_util::{SinkExt, StreamExt}; -use sequencer::api::{self, ApiConfig, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; -use sequencer::inclusion_lane::{PendingUserOp, SequencerError}; -use sequencer::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use sequencer::http::{self, ApiConfig, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +use sequencer::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage, StoredSafeInput}; use sequencer_core::api::WsTxMessage; use sequencer_core::l2_tx::SequencedL2Tx; use sequencer_core::user_op::{SignedUserOp, UserOp}; use sequencer_rust_client::SequencerClient; -use tempfile::TempDir; use tokio::sync::{mpsc, oneshot}; use tokio_tungstenite::connect_async; use tokio_tungstenite::tungstenite::Message; +mod common; +use common::temp_db; + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn ws_subscribe_streams_ordered_txs_from_offset_zero() { let db = temp_db("ws-subscribe-zero"); @@ -44,19 +46,21 @@ async fn ws_subscribe_streams_ordered_txs_from_offset_zero() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 0); - assert_ws_message_matches_tx(second, &expected[1], 1); + // DB offsets (SQLite rowid) start at 1. + assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(second, &expected[1], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn ws_subscribe_resumes_from_given_offset() { let db = temp_db("ws-subscribe-resume"); seed_ordered_txs(db.path.as_str()); + // Resume from DB offset 1 — should get items with offset > 1. let expected = load_ordered_l2_txs_page(db.path.as_str(), 1, 1); assert_eq!( expected.len(), 1, - "resume snapshot must contain one event at offset 1" + "resume snapshot must contain one event at offset 2" ); let Some(runtime) = start_test_server(db.path.as_str()).await else { @@ -73,7 +77,7 @@ async fn ws_subscribe_resumes_from_given_offset() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(first, &expected[0], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -105,7 +109,7 @@ async fn ws_subscribe_receives_live_events_after_subscribing() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(live, &expected[0], base_offset); + assert_ws_message_matches_tx(live, &expected[0], base_offset + 1); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -143,8 +147,8 @@ async fn ws_subscribe_fanout_delivers_live_event_to_multiple_subscribers() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(event_a, &expected[0], base_offset); - assert_ws_message_matches_tx(event_b, &expected[0], base_offset); + assert_ws_message_matches_tx(event_a, &expected[0], base_offset + 1); + assert_ws_message_matches_tx(event_b, &expected[0], base_offset + 1); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -265,8 +269,8 @@ async fn ws_subscribe_allows_catchup_exactly_at_limit() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 0); - assert_ws_message_matches_tx(second, &expected[1], 1); + assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(second, &expected[1], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -304,7 +308,7 @@ async fn ws_subscribe_closes_on_oversized_inbound_message() { } fn seed_ordered_txs(db_path: &str) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -335,6 +339,13 @@ fn seed_ordered_txs(db_path: &str) { payload: vec![0xaa], block_number: 10, }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, ) .expect("append direct input"); storage @@ -343,14 +354,15 @@ fn seed_ordered_txs(db_path: &str) { } fn append_drained_direct_input(db_path: &str, payload: Vec) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage - .load_open_state() + .open_state() .expect("load open state") .expect("open state should exist"); let safe_block = storage .current_safe_block() .expect("read current safe block") + .expect("safe head should have been observed") .saturating_add(1); let next_direct_index = storage .safe_input_end_exclusive() @@ -363,6 +375,13 @@ fn append_drained_direct_input(db_path: &str, payload: Vec) { payload, block_number: safe_block, }], + Address::ZERO, + &sequencer_core::protocol::ProtocolTiming { + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + l1_read_stale_after_blocks: 900, + seconds_per_block: 12, + }, ) .expect("append direct input"); storage @@ -377,7 +396,7 @@ fn append_drained_direct_input(db_path: &str, payload: Vec) { struct WsServerRuntime { addr: std::net::SocketAddr, shutdown: ShutdownSignal, - server_task: Option, + server_task: Option, } impl Drop for WsServerRuntime { @@ -421,7 +440,7 @@ async fn start_test_server_with_limits( batch_submitter_address: None, }, ); - let task = api::start_on_listener( + let task = http::start_on_listener( listener, tx_sender, Eip712Domain { @@ -507,15 +526,18 @@ fn ws_subscribe_url(addr: std::net::SocketAddr, from_offset: u64) -> String { fn ordered_l2_tx_count(db_path: &str) -> u64 { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); storage - .ordered_l2_tx_count() - .expect("query ordered l2 count") + .ordered_l2_tx_head_offset() + .expect("query ordered l2 head offset") } fn load_ordered_l2_txs_page(db_path: &str, from_offset: u64, limit: usize) -> Vec { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); storage - .load_ordered_l2_txs_page_from(from_offset, limit) + .ordered_l2_txs_page_from(from_offset, limit) .expect("load ordered l2 tx page") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() } fn assert_ws_message_matches_tx( @@ -566,20 +588,3 @@ fn assert_ws_message_matches_tx( } } } - -struct TestDb { - _dir: TempDir, - path: String, -} - -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-ws-feed-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} diff --git a/tests/benchmarks/src/bin/report.rs b/tests/benchmarks/src/bin/report.rs index a380edd..3cfb8de 100644 --- a/tests/benchmarks/src/bin/report.rs +++ b/tests/benchmarks/src/bin/report.rs @@ -282,7 +282,7 @@ fn load_latest_multi_row_sweep(dir: &Path) -> Option Eip712Domain { - Eip712Domain { - name: Some(DOMAIN_NAME.to_string().into()), - version: Some(DOMAIN_VERSION.to_string().into()), - chain_id: Some(U256::from(self.chain_id)), - verifying_contract: Some(self.verifying_contract), - salt: None, - } + sequencer_core::build_input_domain(self.chain_id, self.verifying_contract) } } diff --git a/tests/benchmarks/src/lib.rs b/tests/benchmarks/src/lib.rs index 788322a..01d8423 100644 --- a/tests/benchmarks/src/lib.rs +++ b/tests/benchmarks/src/lib.rs @@ -16,8 +16,7 @@ mod workload; pub use ack::{AckRunConfig, AckRunReport, run_ack_benchmark}; pub use domain::{ - BenchmarkDomain, DEFAULT_ENDPOINT, DOMAIN_NAME, DOMAIN_VERSION, parse_address, - resolve_external_benchmark_domain, + BenchmarkDomain, DEFAULT_ENDPOINT, parse_address, resolve_external_benchmark_domain, }; pub use evaluation::{ ACK_P99_TARGET_MS, DIAGNOSTIC_P999_MIN_ACCEPTED_COUNT, NetworkProfile, NetworkProfileKind, @@ -34,6 +33,7 @@ pub use rt_sweep::{ RtSweepMeasurements, RtSweepRow, RtSweepRunReport, RtSweepSummary, compute_rt_sweep_summary, print_rt_sweep_report, write_csv as write_rt_sweep_csv, }; +pub use sequencer_core::{DOMAIN_NAME, DOMAIN_VERSION}; pub use stats::{ Stats, StatsMs, format_optional_f64, print_stats, rejection_rate, summarize, throughput_tx_per_s, diff --git a/tests/e2e/src/main.rs b/tests/e2e/src/main.rs index 69483fc..7b3c903 100644 --- a/tests/e2e/src/main.rs +++ b/tests/e2e/src/main.rs @@ -19,8 +19,20 @@ fn main() { ManagedSequencer::spawn(default_devnet_sequencer_config(log_prefix)) .await?; let scenario_result = scenario(&mut runtime).await; + // Post-test schema invariants: assert the DB's structural + // invariants only if the scenario succeeded — otherwise + // we'd mask the original failure with downstream + // weirdness. Checks the partial unique index, nonce + // contiguity, and FK validity directly against the DB + // file. + let invariant_result = if scenario_result.is_ok() { + runtime.assert_schema_invariants() + } else { + Ok(()) + }; let shutdown_result = runtime.shutdown().await; shutdown_result?; + invariant_result?; scenario_result }) }) diff --git a/tests/e2e/src/test_cases.rs b/tests/e2e/src/test_cases.rs index 827ba77..af3a41b 100644 --- a/tests/e2e/src/test_cases.rs +++ b/tests/e2e/src/test_cases.rs @@ -6,7 +6,8 @@ use std::time::Duration; use crate::{ScenarioFn, ScenarioResult}; use alloy_primitives::{Address, U256}; use rollups_harness::{ - ManagedSequencer, ReplayWalletApp, TestSigner, WalletL1Client, WsClient, sign_user_op_hex, + ManagedSequencer, ReplayWalletApp, RespawnAttemptOutcome, RespawnPolicy, TcpProxy, TestSigner, + WalletL1Client, WsClient, sign_user_op_hex, }; use sequencer_core::api::{TxRequest, WsTxMessage}; use sequencer_core::fee::fee_to_linear; @@ -21,6 +22,97 @@ const DEFAULT_FRAME_FEE: u16 = 1060; /// Max fee used for raw TxRequest construction. Must be >= DEFAULT_FRAME_FEE. const DEFAULT_MAX_FEE: u16 = 1200; +// ── Zone-math constants for the outage-matrix and recovery tests ───────── +// +// These derive from the sequencer's default config so a change to +// `MAX_WAIT_BLOCKS`, `SEQ_PREEMPTIVE_MARGIN_BLOCKS`, or `SEQ_SECONDS_PER_BLOCK` +// flows through here automatically. The compile-time asserts below catch any +// drift that would invalidate the zone framing of the tests (e.g., a per-retry +// advance that no longer crosses MAX_WAIT in the orchestrator loop). +// +// The picks (PRE / DANGER / PAST_STALE) are deliberately well inside their +// zones to give tests slack against scheduling jitter and timing drift. + +/// Source of truth: shared between sequencer + scheduler via +/// `sequencer_core::MAX_WAIT_BLOCKS`. +const MAX_WAIT_BLOCKS: u64 = sequencer_core::MAX_WAIT_BLOCKS; + +/// Default `SEQ_PREEMPTIVE_MARGIN_BLOCKS` from `runtime/config.rs`. The +/// harness spawn path does not override this flag, so the binary uses the +/// configured default. If the default changes, update here so +/// `DANGER_THRESHOLD_BLOCKS` stays aligned. +const DEFAULT_PREEMPTIVE_MARGIN_BLOCKS: u64 = 300; + +/// Default `SEQ_SECONDS_PER_BLOCK` from `runtime/config.rs`. The harness +/// `advance_wall_and_mine` also assumes this value internally. +const DEFAULT_SECONDS_PER_BLOCK: u64 = 12; + +/// Derived: the preemptive-recovery danger threshold. Below this we're safe; +/// above it (but below `MAX_WAIT_BLOCKS`) is the danger zone where the +/// sequencer triggers flush + shutdown but no cascade. +const DANGER_THRESHOLD_BLOCKS: u64 = MAX_WAIT_BLOCKS - DEFAULT_PREEMPTIVE_MARGIN_BLOCKS; + +/// Pre-danger pick — well below `DANGER_THRESHOLD_BLOCKS` so background drift +/// can't accidentally tip a test into the danger zone. +const PRE_DANGER_BLOCKS: u64 = 500; + +/// Danger-zone pick — comfortably past `DANGER_THRESHOLD_BLOCKS`, comfortably +/// below `MAX_WAIT_BLOCKS`. Used by tests that want "danger detected, no +/// cascade" framing. +const DANGER_ZONE_BLOCKS: u64 = 1150; + +/// Past-stale pick — comfortably past `MAX_WAIT_BLOCKS`. Startup recovery +/// must cascade at this point. +const PAST_STALE_BLOCKS: u64 = 1250; + +/// Per-retry L1 + wall-clock advance for `respawn_until_stable` loops that +/// start in the danger zone. The closed in-danger batch only cascades once +/// it ages past `MAX_WAIT_BLOCKS`, so each retry has to push the system +/// across that boundary within `RespawnPolicy::max_attempts`. The +/// compile-time check below pins the load-bearing relationship. +const RESPAWN_RETRY_ADVANCE_BLOCKS: u64 = 100; + +/// Convert a block count to wall-clock duration assuming the default block time. +const fn blocks_as_duration(blocks: u64) -> Duration { + Duration::from_secs(blocks * DEFAULT_SECONDS_PER_BLOCK) +} + +// Compile-time guards: drift in the constants above that breaks the test +// framing fails the build instead of failing tests at runtime. +const _: () = { + assert!( + DANGER_THRESHOLD_BLOCKS < MAX_WAIT_BLOCKS, + "danger threshold must precede the staleness boundary", + ); + assert!( + PRE_DANGER_BLOCKS < DANGER_THRESHOLD_BLOCKS, + "PRE_DANGER_BLOCKS must stay below DANGER_THRESHOLD_BLOCKS", + ); + assert!( + DANGER_ZONE_BLOCKS > DANGER_THRESHOLD_BLOCKS, + "DANGER_ZONE_BLOCKS must clear DANGER_THRESHOLD_BLOCKS", + ); + assert!( + DANGER_ZONE_BLOCKS < MAX_WAIT_BLOCKS, + "DANGER_ZONE_BLOCKS must stay below MAX_WAIT_BLOCKS (no premature cascade)", + ); + assert!( + PAST_STALE_BLOCKS > MAX_WAIT_BLOCKS, + "PAST_STALE_BLOCKS must exceed MAX_WAIT_BLOCKS (cascade must fire)", + ); + // Load-bearing for .x: starting from a closed + // in-danger batch, one retry advance must push it past MAX_WAIT_BLOCKS + // so cascade fires before max_attempts is exhausted. If + // `RESPAWN_RETRY_ADVANCE_BLOCKS` shrinks or `MAX_WAIT_BLOCKS` grows + // such that this no longer holds, tests would silently start failing + // by exhausting their retries — the compile-time check makes the + // breakage visible immediately. + assert!( + DANGER_ZONE_BLOCKS + RESPAWN_RETRY_ADVANCE_BLOCKS > MAX_WAIT_BLOCKS, + "RESPAWN_RETRY_ADVANCE_BLOCKS must cross MAX_WAIT from DANGER_ZONE in one retry", + ); +}; + struct ExpectedWalletState { address: Address, balance: U256, @@ -59,9 +151,160 @@ pub fn test_cases() -> Vec<(&'static str, ScenarioFn)> { ("multi_deposit_same_block_test", |runtime| { Box::pin(run_multi_deposit_same_block_test(runtime)) }), - ("shutdown_during_inflight_test", |runtime| { - Box::pin(run_shutdown_during_inflight_test(runtime)) + ( + "restart_after_committed_tx_replays_cleanly_test", + |runtime| Box::pin(run_restart_after_committed_tx_replays_cleanly_test(runtime)), + ), + ("recovery_after_stale_batches_test", |runtime| { + Box::pin(run_recovery_after_stale_batches_test(runtime)) }), + ("sequencer_outage_pre_danger_no_recovery_test", |runtime| { + Box::pin(run_sequencer_outage_pre_danger_no_recovery_test(runtime)) + }), + ("sequencer_outage_danger_zone_tip_cascade_test", |runtime| { + Box::pin(run_sequencer_outage_danger_zone_tip_cascade_test(runtime)) + }), + ("provider_outage_past_stale_cascades_test", |runtime| { + Box::pin(run_provider_outage_past_stale_cascades_test(runtime)) + }), + ("provider_outage_wall_clock_refuses_boot_test", |runtime| { + Box::pin(run_provider_outage_wall_clock_refuses_boot_test(runtime)) + }), + ("wall_clock_backward_jump_no_panic_test", |runtime| { + Box::pin(run_wall_clock_backward_jump_no_panic_test(runtime)) + }), + ("stalled_safe_head_startup_refuses_boot_test", |runtime| { + Box::pin(run_stalled_safe_head_startup_refuses_boot_test(runtime)) + }), + ( + "provider_outage_pre_danger_sequencer_continues_test", + |runtime| { + Box::pin(run_provider_outage_pre_danger_sequencer_continues_test( + runtime, + )) + }, + ), + ( + "provider_outage_danger_zone_sequencer_self_exits_test", + |runtime| { + Box::pin(run_provider_outage_danger_zone_sequencer_self_exits_test( + runtime, + )) + }, + ), + ("provider_outage_short_hiccup_no_recovery_test", |runtime| { + Box::pin(run_provider_outage_short_hiccup_no_recovery_test(runtime)) + }), + ( + "both_down_danger_zone_sequencer_first_refuses_boot_test", + |runtime| { + Box::pin(run_both_down_danger_zone_sequencer_first_refuses_boot_test( + runtime, + )) + }, + ), + ( + "both_down_danger_zone_proxy_first_restart_cycle_recovers_test", + |runtime| { + Box::pin(run_both_down_danger_zone_proxy_first_restart_cycle_recovers_test(runtime)) + }, + ), + ( + "sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test", + |runtime| { + Box::pin( + run_sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test(runtime), + ) + }, + ), + ( + "provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test", + |runtime| { + Box::pin( + run_provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test( + runtime, + ), + ) + }, + ), + ( + "first_boot_l1_unreachable_never_synced_refuses_boot_test", + |runtime| { + Box::pin(run_first_boot_l1_unreachable_never_synced_refuses_boot_test(runtime)) + }, + ), + ("delayed_inclusion_cascades_on_restart_test", |runtime| { + Box::pin(run_delayed_inclusion_cascades_on_restart_test(runtime)) + }), + ("aging_open_tip_runtime_danger_zone_exit_test", |runtime| { + Box::pin(run_aging_open_tip_runtime_danger_zone_exit_test(runtime)) + }), + ("stalled_safe_head_live_exit_test", |runtime| { + Box::pin(run_stalled_safe_head_live_exit_test(runtime)) + }), + ( + "ws_reconnect_at_invalidated_offset_skips_cleanly_test", + |runtime| { + Box::pin(run_ws_reconnect_at_invalidated_offset_skips_cleanly_test( + runtime, + )) + }, + ), + ( + "ws_subscribe_from_future_offset_waits_silently_test", + |runtime| { + Box::pin(run_ws_subscribe_from_future_offset_waits_silently_test( + runtime, + )) + }, + ), + ( + "recovery_drains_safe_but_undrained_direct_input_test", + |runtime| { + Box::pin(run_recovery_drains_safe_but_undrained_direct_input_test( + runtime, + )) + }, + ), + ( + "recovery_batch_opens_empty_when_no_direct_inputs_pending_test", + |runtime| { + Box::pin(run_recovery_batch_opens_empty_when_no_direct_inputs_pending_test(runtime)) + }, + ), + ("replay_matches_live_for_mixed_workload_test", |runtime| { + Box::pin(run_replay_matches_live_for_mixed_workload_test(runtime)) + }), + ( + "provider_outage_input_reader_retries_after_reconnect_test", + |runtime| { + Box::pin(run_provider_outage_input_reader_retries_after_reconnect_test(runtime)) + }, + ), + ( + "first_boot_no_identity_l1_unreachable_refuses_boot_test", + |runtime| { + Box::pin(run_first_boot_no_identity_l1_unreachable_refuses_boot_test( + runtime, + )) + }, + ), + ( + "chain_id_mismatch_via_live_rpc_refuses_boot_test", + |runtime| { + Box::pin(run_chain_id_mismatch_via_live_rpc_refuses_boot_test( + runtime, + )) + }, + ), + ( + "nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test", + |runtime| { + Box::pin( + run_nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test(runtime), + ) + }, + ), ] } @@ -241,7 +484,9 @@ async fn run_reconnect_from_offset_test(runtime: &mut ManagedSequencer) -> Scena let deposit_message = apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) .await?; - let reconnect_offset = deposit_message.offset().saturating_add(1); + // WS replay is cursor-based and exclusive: `from_offset` means + // "start after this already-consumed DB offset". + let reconnect_offset = deposit_message.offset(); drop(ws); alice_l2.transfer(bob_address, transfer_amount).await?; @@ -583,7 +828,15 @@ async fn run_multi_deposit_same_block_test(runtime: &mut ManagedSequencer) -> Sc Ok(()) } -async fn run_shutdown_during_inflight_test(runtime: &mut ManagedSequencer) -> ScenarioResult<()> { +// Restart after a committed tx and verify replay stays consistent. +// +// This is intentionally not an "in-flight request during shutdown" test: +// `WalletL2Client::transfer()` awaits the HTTP ack, so by the time restart +// happens the user-op is already durable. What this locks down is the +// committed-tx replay path across restart. +async fn run_restart_after_committed_tx_replays_cleanly_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { let alice = TestSigner::from_default(1)?; let alice_address = alice.address(); @@ -628,14 +881,2475 @@ async fn run_shutdown_during_inflight_test(runtime: &mut ManagedSequencer) -> Sc Ok(()) } -fn eip712_domain(runtime: &ManagedSequencer) -> alloy_sol_types::Eip712Domain { - alloy_sol_types::Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(runtime.domain_chain_id())), - verifying_contract: Some(runtime.verifying_contract()), - salt: None, - } +async fn run_recovery_after_stale_batches_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let post_recovery_transfer = U256::from(200_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + // Step 1: Fund Alice via L1 deposit. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + + // Step 2: Alice transfers to Bob (this will be lost after recovery). + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Verify pre-recovery state. + assert_eq!( + replay_before.current_user_balance(alice_address), + deposit_amount - transfer_amount - gas, + ); + assert_eq!( + replay_before.current_user_balance(bob_address), + transfer_amount, + ); + + // Step 3: Kill the sequencer (Anvil stays up). + drop(ws); + runtime.stop().await?; + + // Step 4: Simulate ~4h of outage: advance both L1 and wall clock by + // MAX_WAIT_BLOCKS * SECONDS_PER_BLOCK = 1200 * 12 = 14400s. On respawn, + // l1_safe_head will be >1200 blocks past the frames' safe_block. + runtime + .advance_wall_and_mine(blocks_as_duration(MAX_WAIT_BLOCKS)) + .await?; + + // Step 5: Respawn the sequencer. Startup recovery should detect staleness. + runtime.respawn().await?; + + // Step 6: Replay from offset 0 after recovery. + // The deposit should be re-drained into the recovery batch. + // The transfer should be GONE (it was in an invalidated batch). + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + + // Expect the re-drained deposit. + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + + // No more messages — the transfer was invalidated. + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Alice should have her full deposit back (no transfer deducted). + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "after recovery, Alice should have full deposit (transfer was invalidated)" + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "after recovery, Bob should have zero (transfer was invalidated)" + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // Step 8: Verify new work succeeds after recovery. + let mut alice_l2_fresh = runtime.wallet_l2(alice)?; + alice_l2_fresh + .transfer(bob_address, post_recovery_transfer) + .await?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount - post_recovery_transfer - gas, + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + post_recovery_transfer, + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 1); + + Ok(()) +} + +// ── Sequencer outage, pre-danger zone ──────────────────────────── +// +// Sequencer stops with an open batch (deposit + transfer); L1 advances 500 +// blocks (well below the danger threshold of 900). On restart: +// - Startup recovery runs but finds no danger zone → no flush. +// - No batches are stale → no cascade invalidation. +// - The deposit and transfer persist across the restart. +// - New txs succeed against the unchanged state. +// +// This is the positive control for the recovery procedure: it must NOT fire +// (or over-fire) when L1 hasn't drifted enough to cause trouble. + +async fn run_sequencer_outage_pre_danger_no_recovery_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pick an advance safely below the danger threshold + // (MAX_WAIT_BLOCKS 1200 - default margin 300 = 900). + const PRE_DANGER: Duration = blocks_as_duration(PRE_DANGER_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + // Step 1: Fund Alice and record a transfer. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + let expected_alice_balance = deposit_amount - transfer_amount - gas; + let expected_bob_balance = transfer_amount; + + // Step 2: Stop the sequencer. Leave Anvil running. + drop(ws); + runtime.stop().await?; + + // Step 3: Advance L1 + wall-clock a pre-danger amount (500 blocks ≈ 100min + // < 900 block danger threshold). + runtime.advance_wall_and_mine(PRE_DANGER).await?; + + // Step 4: Restart. No recovery should fire. + runtime.respawn().await?; + + // Step 5: Replay via WS from offset 0. Both the deposit and transfer must + // still be present (no invalidation). + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + expected_alice_balance, + "pre-danger restart must preserve Alice's balance", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + expected_bob_balance, + "pre-danger restart must preserve Bob's balance", + ); + assert_eq!( + replay_after.current_user_nonce(alice_address), + 1, + "Alice's nonce must NOT be reset", + ); + + // Step 6: No further messages queued. Confirm nothing else comes through. + // (A follow-up "new work succeeds" step is omitted here because the + // harness's `wallet_l2` initializes its local nonce counter at 0, and + // this scenario explicitly does NOT reset the on-chain nonce — the + // post-restart nonce is 1. Adding a "submit at nonce 1" check would + // require harness plumbing beyond the scope of this regression test.) + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + Ok(()) +} + +// ── Sequencer outage, danger zone (Tip cascade) ─────────────────── +// +// Sequencer stops; L1 advances into the danger zone (past `danger_threshold`) +// but strictly below `MAX_WAIT_BLOCKS`. On restart: +// - `check_danger` returns `TipInDanger(idx)` — the closed-frontier check finds +// nothing past gold, but the open Tip's first frame has aged past +// `danger_threshold`. +// - `decide_startup_action` returns `RecoverTip` (no flush — the Tip has +// no L1 footprint). +// - `recover_aging_tip` cascades the Tip; pre-outage soft-confirmed user +// ops are rolled back (this is the documented "soft confirmations may +// be invalidated under recovery" semantics). +// +// This exercises the Tip-recovery-without-flush path specifically. Earlier +// behavior used `MAX_WAIT_BLOCKS` for the Tip threshold and would have +// preserved the user op until current_safe_block crossed `MAX_WAIT_BLOCKS`; +// the policy is now to invalidate at `danger_threshold` so the system +// stops issuing pre-confirmations on a Tip that's already operationally +// suspect. + +async fn run_sequencer_outage_danger_zone_tip_cascade_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pick advance in the danger zone: > danger_threshold (900) but < MAX_WAIT (1200). + // Decoupled from wall clock on purpose: this test exercises the + // block-based danger check in isolation. Uses module-level + // `DANGER_ZONE_BLOCKS` (see top-of-file zone constants). + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + let expected_alice_balance = deposit_amount - transfer_amount - gas; + let expected_bob_balance = transfer_amount; + + drop(ws); + runtime.stop().await?; + + // L1 advances into the danger zone but strictly below the staleness + // threshold. The Tip's first frame has aged past `danger_threshold`, + // so the startup `RecoverTip` path cascades it (no flush — the Tip + // has no L1 footprint). Alice's pre-outage transfer was a soft + // confirmation against the Tip; it's rolled back. + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + let _ = expected_alice_balance; + let _ = expected_bob_balance; + + runtime.respawn().await?; + + // After Tip cascade: balances roll back to the post-deposit / pre-transfer + // state, nonces reset, and the WS feed should not replay the invalidated + // user op. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "Tip cascade must roll Alice back to her deposit (transfer was \ + soft-confirmed against the cascaded Tip)", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob never received the rolled-back transfer", + ); + assert_eq!( + replay_after.current_user_nonce(alice_address), + 0, + "nonce must reset when Tip cascade rolls back the user op", + ); + + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + Ok(()) +} + +// ── Provider outage, past-stale (recovery through proxy) ────────── +// +// Scenario: the sequencer is routed through a `TcpProxy`, simulating a +// gateway in front of the real L1 node. While the sequencer is stopped, +// a temporary outage happens (proxy disconnected), L1 advances past the +// staleness threshold, and the outage ends (proxy reconnected). The next +// sequencer restart connects via the proxy, sees the advanced safe head, +// and cascade-invalidates the stale open batch. +// +// What this locks down that the sequencer-outage tests don't: +// - The proxy is actually wired into the RPC path. Subsequent RPC calls +// from the sequencer (safe-head sync, batch submission) route through +// it. If `set_l1_endpoint_override` ever regressed (e.g., respawn +// ignored the override), this test would fail. +// - Recovery over a non-direct connection works end-to-end. +// +// Note on wall-clock fallback: in principle this scenario would also test +// the fallback refusing to boot when L1 is unreachable AND real time has +// elapsed past the danger threshold. In practice, `anvil_mine(N)` takes +// milliseconds of real wall-clock time, so the fallback correctly reports +// "not yet in danger by wall-clock" and lets the sequencer boot with stale +// data. Exercising the wall-clock-refuses-to-boot path requires either +// direct `synced_at_ms` DB manipulation or a time-skew tool — deferred. + +async fn run_provider_outage_past_stale_cascades_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Advance comfortably past staleness so the test is robust to small + // scheduling drifts. + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Normal setup — deposit + transfer (the transfer will be lost). + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Step 2: Stop the sequencer and insert a proxy into the L1 path. + drop(ws); + runtime.stop().await?; + + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + + // Step 3: Simulate a gateway outage that spans the staleness window. + // - Disconnect the proxy (gateway is down). + // - Mine 1250 blocks directly on Anvil (bypasses the proxy). + // - Reconnect the proxy (gateway is back). + // During the outage the sequencer is stopped; when it comes back up, + // it will see the advanced safe head through the proxy. + proxy.disconnect(); + runtime.advance_wall_and_mine(PAST_STALE).await?; + proxy.reconnect(); + + // Step 4: Respawn. The sequencer dials the proxy, the proxy forwards + // to Anvil, `sync_to_current_safe_head` returns 1250+ blocks past the + // open Tip's first frame. `check_danger` fires `TipInDanger(idx)`, + // `decide_startup_action` returns `RecoverTip`, `recover_aging_tip` + // cascades the Tip and opens a fresh one. + runtime.respawn().await?; + + // Step 5: Verify via WS replay. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "transfer must be invalidated after past-stale outage routed through proxy", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's receiving balance must be rolled back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // Step 6: Tear down the proxy cleanly. + proxy.shutdown().await?; + + Ok(()) +} + +// ── Wall-clock fallback refuses to boot past danger threshold ───── +// +// Scenario: L1 is unreachable AND wall-clock time has elapsed past the +// danger threshold since the last successful L1 sync. The sequencer must +// refuse to boot — proceeding would mean issuing soft confirmations against +// stale L1 state, potentially missing that batches are already doomed. +// +// This test only became possible after the `find_first_batch_in_danger` +// unification. Prior to that fix, an open batch was invisible to +// `check_danger_zone`, so the wall-clock fallback could "miss" an open +// batch aging into danger while L1 was unreachable and boot anyway. +// +// The wall-clock illusion is created without OS tooling: `rewind_synced_at_ms` +// rewrites `l1_safe_head.synced_at_ms` to an older timestamp, equivalent +// to advancing the wall clock from the sequencer's perspective. We mine +// an equivalent number of blocks on Anvil to keep the block-time coupling +// documented in `docs/threat-model/README.md`. + +async fn run_provider_outage_wall_clock_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pick an elapsed time comfortably past the danger threshold. Defaults: + // seconds_per_block=12, danger_threshold=MAX_WAIT_BLOCKS(1200)-margin(300)=900. + // We need elapsed_secs / 12 > 900 → elapsed_secs > 10800. Use 5h. + const OUTAGE: Duration = Duration::from_secs(5 * 60 * 60); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Normal setup — deposit + transfer (transfer will be lost). + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Step 2: Stop the sequencer, insert proxy, disconnect it, advance both + // the wall clock and L1 by the outage duration — block-time coupled so + // the sequencer sees a consistent view (5h ≈ 1500 blocks at 12s/block). + drop(ws); + runtime.stop().await?; + + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + runtime.advance_wall_and_mine(OUTAGE).await?; + + // Step 3: Attempt respawn with proxy disconnected. The sequencer: + // - dials the proxy → sync_to_current_safe_head fails (L1 unreachable). + // - sees the persisted safe block timestamp is older than the L1 + // read-staleness threshold. + // - decide_startup_action returns Refuse(L1ViewStale) → process exits with failure. + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "respawn must fail: wall-clock says past-danger AND open batch is in danger", + ); + + // Step 4: Reconnect the proxy and respawn normally. Sync now succeeds, + // the stale open batch is cascade-invalidated, recovery batch opens. + proxy.reconnect(); + runtime.respawn().await?; + + // Step 5: Verify the invalidation: only the re-drained deposit appears. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "transfer must be invalidated after wall-clock-triggered recovery", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO,); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// `SystemTime::now()` backward jump → `saturating_sub` handles +// cleanly, no panic. +// +// Scenario: normal setup creates DB state at real time T. Stop, disconnect +// proxy, backward-jump the clock via faketime, respawn with L1 unreachable. +// The wall-clock fallback runs: +// +// elapsed = now(T-1h).saturating_sub(last_sync_at_ms(≈T)) = 0 +// +// No danger → boot proceeds. After reconnect, normal operation resumes. +// If `saturating_sub` ever regresses to a plain subtraction (underflow +// panic on u64), this test panics at respawn. +async fn run_wall_clock_backward_jump_no_panic_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut ws = runtime.ws(0).await?; + let mut replay_before = ReplayWalletApp::devnet(); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(100_000_u64), + ) + .await?; + drop(ws); + + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + runtime.set_faketime_offset(Some("-1h".to_string()))?; + + // Respawn must NOT panic. With L1 unreachable, the wall-clock fallback + // is the only path that sees `now - last_sync_ms` — if the subtraction + // ever became non-saturating, this call would panic via u64 underflow. + runtime.respawn().await?; + + // Clean up: reconnect and let the sequencer catch up normally. + proxy.reconnect(); + // Clear the offset for subsequent respawns (not used here, but keeps the + // teardown deterministic if future cleanup code respawns). + runtime.set_faketime_offset(None)?; + + proxy.shutdown().await?; + Ok(()) +} + +// Provider reachable, safe head frozen, startup refuses to boot. +// +// Tests the stale-L1-view startup arm in isolation: when L1 is +// reachable but the safe head hasn't advanced since the last observation, +// the safe block timestamp crosses the read-staleness threshold and startup +// refuses. +// +// Scenario: +// 1. Set up baseline state (deposit + transfer in the open Tip). The Tip +// stays well below `danger_threshold` — we don't pre-age it, because +// the runtime detector now exits on `DangerStatus::TipInDanger` and we want +// a clean stop. +// 2. Stop the sequencer. +// 3. Advance only the sequencer's wall clock (no mining → safe head +// frozen). The offset must exceed `danger_threshold * SECONDS_PER_BLOCK` +// so the wall-clock-adjusted threshold saturates to 0 and catches even +// a fresh Tip. +// 4. Respawn → `check_danger` fires `L1ViewStale` → `Refuse(L1ViewStale)`. +// 5. Mine one L1 block and clear the faketime offset; safe-head progress +// resumes, the timestamp refreshes on sync, and the sequencer stays up. +async fn run_stalled_safe_head_startup_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // 4 hours: comfortably past `danger_threshold * SECONDS_PER_BLOCK` + // (900 blocks * 12 s = 10800 s = 3 h), so missed_blocks > danger_threshold, + // adjusted threshold saturates to 0, and the wall-clock arm catches even + // a fresh Tip. + const STALLED_SAFE_HEAD_OFFSET: &str = "+14400s"; + const SAFE_HEAD_SYNC_WINDOW: Duration = Duration::from_secs(8); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + runtime.stop().await?; + runtime.set_faketime_offset(Some(STALLED_SAFE_HEAD_OFFSET.to_string()))?; + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "startup must refuse when L1 is reachable but the safe head stayed frozen \ + long enough that the L1 view is stale", + ); + + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "startup refusal on a reachable-but-stalled safe head must not cascade batches: {counts:?}", + ); + + // Resume safe-head progress: mine a block and reset the faketime offset. + // On respawn, sync refreshes `last_safe_progress_ms` to faketime-now, + // wall-clock arm sees `missed = 0`, `Safe` → `Proceed`, sequencer stays up. + runtime.mine_l1_blocks(1).await?; + runtime.set_faketime_offset(None)?; + runtime.respawn().await?; + + let stable_after_progress = runtime.observe_for(SAFE_HEAD_SYNC_WINDOW).await?; + assert!( + stable_after_progress.is_none(), + "once safe-head progress resumes, the sequencer should boot and remain stable; got {stable_after_progress:?}", + ); + + Ok(()) +} + +// provider outage in the pre-danger zone while the sequencer stays +// running. +// +// Load-under-outage check: the sequencer must continue to accept user ops, +// persist them, broadcast on WS, and CLOSE BATCHES BY SIZE while its L1 +// connection is down. Proves the inclusion lane is independent of L1 +// reachability — as long as the wall-clock fallback keeps the pre-danger +// verdict, the sequencer keeps doing useful work. +// +// Scenario: +// 1. Spawn + apply a large deposit so Alice can fund many transfers. +// 2. Route the sequencer through a proxy (stop → set override → respawn). +// 3. Disconnect the proxy, advance L1 by a pre-danger amount (500 blocks). +// 4. Submit enough transfers (~150 × ~100 B each ≈ 15 KB) to exceed the +// default ~12 KB batch-size target, guaranteeing at least one size- +// triggered batch close during the outage. +// 5. Assert `count_batches().sealed` strictly increased during the outage. +// 6. Reconnect the proxy; confirm one more transfer goes through and the +// schema invariants hold post-test. +async fn run_provider_outage_pre_danger_sequencer_continues_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pre-danger budget: see module-level `PRE_DANGER_BLOCKS` (500 blocks = + // 100min at 12s/block, well below the danger threshold). + // Enough transfers to exceed the default ~12 KB batch size target. Each + // transfer user_op is ≈ 100 B (SSZ-encoded Transfer + signature + nonce), + // so 150 ops ≈ 15 KB — one batch close is guaranteed; two or more is + // typical. + const TRANSFERS_DURING_OUTAGE: usize = 150; + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Deposit big — Alice needs to cover 150+ transfers and their fees. + // Default fee per user-op ≈ 3873 units (log-fee 1060); reserve margin. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(10_000_000_u64); + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) + .await?; + } + + // Step 2: Insert the proxy and route the sequencer through it via + // stop → set override → respawn. The initial spawn (direct to Anvil) is + // treated as setup only; from here on, all sequencer → L1 traffic flows + // through the proxy. + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + // Step 3: Connect a fresh WS (catches up the deposit from offset 0) and + // a fresh L2 wallet. Consume the deposit replay so subsequent + // `expect_user_op_from` calls line up. + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + + // Baseline: one transfer while the proxy is still connected, confirming + // end-to-end plumbing works through the proxy. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_before = runtime.count_batches()?; + + // Step 4: Cut the L1 connection, advance Anvil by 500 blocks (pre-danger). + // The sequencer is still running; its wall-clock fallback sees real time + // not yet past the threshold, so it keeps retrying rather than shutting + // down. + proxy.disconnect(); + runtime.mine_l1_blocks(PRE_DANGER_BLOCKS).await?; + + // Step 5: Submit many transfers during the outage. Each should be + // accepted (POST /tx succeeds), broadcast on WS, and eventually packed + // into a new batch. Size-triggered close fires when the cumulative user-op + // bytes exceed the default target. + for _ in 0..TRANSFERS_DURING_OUTAGE { + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Step 6: Batch closure during outage — the whole point of this test. + let batches_mid = runtime.count_batches()?; + assert!( + batches_mid.sealed > batches_before.sealed, + "sequencer must continue closing batches during L1 outage: \ + before={before:?}, after={after:?}", + before = batches_before, + after = batches_mid, + ); + + // Step 7: Restore L1 connectivity. The batch submitter's next tick + // reaches L1 again and starts draining the pending batches. + proxy.reconnect(); + + // Final check: one more transfer goes through after reconnect, proving + // the sequencer didn't just survive — it's fully operational. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Sanity: total sealed batches grew from baseline to final, and nothing + // got invalidated (pre-danger → no recovery triggered). + let batches_final = runtime.count_batches()?; + assert!( + batches_final.sealed > batches_before.sealed, + "final sealed count {final:?} must exceed baseline {before:?}", + final = batches_final, + before = batches_before, + ); + assert_eq!( + batches_final.invalidated, 0, + "pre-danger outage must not invalidate any batches, got {:?}", + batches_final, + ); + + proxy.shutdown().await?; + Ok(()) +} + +// provider outage aging into the danger zone while the sequencer is +// running — sequencer detects via its live wall-clock fallback and self-exits +// with `DangerDetected`. Also verifies the startup freshness checks refuse +// subsequent boots while L1 is still unreachable. +// +// The full "reconnect → recover → no cascade" cycle needs the harness to +// handle an orchestrator-style restart loop (the first post-reconnect boot +// may still trip the danger check and exit, requiring another boot after +// enough blocks age out). That's tracked separately and +// deliberately out of scope here. +// +// Uses dynamic faketime (FAKETIME_TIMESTAMP_FILE re-read on every time call) +// to jump the sequencer's clock past the danger threshold mid-run without +// respawning — the scenario we'd otherwise need 3h45min of real wall-clock +// time to reproduce. +async fn run_provider_outage_danger_zone_sequencer_self_exits_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Defaults: MAX_WAIT_BLOCKS=1200, margin=300, danger_threshold=900 + // blocks at 12s/block = 10800s = 3h. Use 3h55min: comfortably past + // danger, under MAX_WAIT (so no cascade fires later). + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Baseline — deposit + transfer so there's observable state. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(500_000_u64), + ) + .await?; + } + + // Step 2: Switch routing to the proxy (stop → set override → respawn). + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Step 3: Disconnect the proxy and advance both clocks into the danger + // zone. The running `DangerDetector` polls `Storage::check_danger` + // every cadence; with the proxy down the input reader can't refresh + // `last_safe_progress_ms` or the safe block timestamp, so the freshness + // checks eventually emit a non-Safe status and the process exits. + proxy.disconnect(); + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // Step 4: Wait for the sequencer to detect and self-exit. Dynamic + // faketime means the shift hits the submitter's next tick immediately — + // no real-time wait needed. + let exit_status = runtime.wait_for_exit(Duration::from_secs(30)).await?; + assert!( + !exit_status.success(), + "sequencer must self-exit with non-zero status on danger detection, got {exit_status:?}", + ); + + // Step 5: Try to respawn while proxy is still disconnected. Startup + // runs the same wall-clock fallback via `run_preemptive_recovery` and + // should refuse to boot (`decide_startup_action → Refuse(...)`). + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "respawn must fail while proxy disconnected and wall-clock past danger", + ); + + // No cascade happened yet — batches under MAX_WAIT are not invalidated + // by startup recovery, only preemptively shut-down-and-flushed. + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "danger-zone (not past-stale) must not invalidate batches: {counts:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// Short-duration provider hiccup, heals within pre-danger. +// +// The most-common production fault: an RPC gateway flakes briefly, retries +// succeed. No recovery should fire. +// +// What this tests that the longer-disconnect provider-outage test doesn't: the longer test disconnects for a +// 500-block L1 advance + 150 transfers worth of real time, exercising the +// inclusion lane under load. instead exercises the "pure retry +// loop" path: **no** L1 advance, **no** faketime advance, just a few seconds +// of real-time wall-clock downtime across at least one +// `idle_poll_interval_ms` (default 5 s) so the submitter definitely attempts +// and fails a tick, then the reconnect path lets the next tick succeed. +// +// Scenario: +// 1. Route through proxy; establish a baseline transfer. +// 2. Disconnect, submit one more transfer (inclusion lane must still +// accept), sleep >5 s so the submitter's tick hits the disconnect. +// 3. Reconnect, submit another transfer. +// 4. Assert no batches were invalidated and POST /tx still works. +async fn run_provider_outage_short_hiccup_no_recovery_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Long enough to straddle the default 5 s submitter idle_poll_interval so + // at least one retry actually fails against the disconnected proxy. + const HICCUP_DURATION: Duration = Duration::from_secs(6); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(2_000_000_u64); + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) + .await?; + } + + // Route through the proxy (stop → override → respawn). + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + + // Baseline transfer via the proxy, proving the proxy path works. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_before = runtime.count_batches()?; + + // Disconnect: the submitter's next tick (within 5 s) fails against the + // disconnected proxy, runs wall_clock_danger_estimate with ~zero + // elapsed — far below danger threshold — and just retries. + proxy.disconnect(); + + // Inclusion lane is independent of L1; POST /tx still accepts. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Wait at least one full submitter idle_poll_interval (default 5 s) so the + // failed-retry path is definitely exercised under the disconnect. + tokio::time::sleep(HICCUP_DURATION).await; + + proxy.reconnect(); + + // Reconnect: another transfer goes through normally — proves the + // sequencer didn't just sit there, its next tick genuinely recovered. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_after = runtime.count_batches()?; + assert_eq!( + batches_after.invalidated, 0, + "a short pre-danger hiccup must not invalidate any batch: {batches_after:?}", + ); + assert!( + batches_after.sealed >= batches_before.sealed, + "sealed-batch count must be monotonic across a hiccup: \ + before={batches_before:?}, after={batches_after:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// Both down, sequencer returns first into the danger zone, refuses +// to boot. +// +// Companion to (past-stale cascades through proxy): this is the +// *danger-zone* window of the same setup. Sequencer is stopped AND the proxy +// is disconnected; wall-clock and L1 advance into the danger zone but stay +// below `MAX_WAIT_BLOCKS`; the sequencer comes back first while L1 is still +// unreachable. Startup's wall-clock fallback must see "past danger" and +// refuse the boot — advancing the safe head off stale data would risk +// issuing soft confirmations against a state that may already be doomed. +// +// No cascade is expected yet (we haven't crossed MAX_WAIT_BLOCKS). The test +// stops at the refuse-to-boot assertion — the full reconnect+recovery cycle +// is covered by the both-down-proxy-first restart-cycle test below. +async fn run_both_down_danger_zone_sequencer_first_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Safely inside the danger zone: past 900-block threshold, below 1200. + // 3h55min at 12 s/block = 1175 blocks. + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline: deposit + transfer (both will survive — no cascade expected + // in the danger-zone window). + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Both down: stop sequencer, insert proxy, disconnect proxy. + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + // Coupled advance into the danger zone. Anvil mines behind the proxy + // (direct connection via `mine_l1_blocks`), and faketime shifts the + // sequencer's wall clock cumulatively. + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // Respawn while proxy is still disconnected: sync fails → wall-clock + // fallback computes past-danger → refuses to boot. + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "sequencer must refuse to boot while L1 unreachable and wall-clock past danger", + ); + + // No cascade should have run yet — we haven't crossed MAX_WAIT_BLOCKS. + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "refuse-to-boot must not invalidate any batch: {counts:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// Both down, proxy returns first, then sequencer — restart cycle +// converges. +// +// Complement to (sequencer first): here L1 comes back before the +// sequencer does. Once the sequencer restarts, startup recovery sees L1 +// reachable and the Tip aged past `danger_threshold`, so `check_danger` +// returns `TipInDanger(idx)` → `decide_startup_action` returns `RecoverTip` → +// `recover_aging_tip` cascades the Tip and opens a fresh one. Convergence +// typically happens on the first respawn. +// +// Other paths can fire under different timings — e.g., the lane might +// close the Tip into a nonced closed batch during boot, the submitter +// might land it on L1 fresh (Gold) or stale (Silver), and a subsequent +// respawn picks up the post-flush cascade or the existing safe_input as +// canonical. The test asserts only convergence to `Stable`, not which +// specific path fires. +// +// The key invariant this tests that the existing .x tests don't: the full +// *restart-loop* works. Earlier tests stopped at "first respawn exits" +// because the harness lacked an orchestrator-restart primitive; now we have +// `respawn_until_stable`, so we can drive the loop to convergence. +async fn run_both_down_danger_zone_proxy_first_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + // Each restart attempt advances ~20 min (100 blocks) of additional L1 + // time, simulating the real orchestrator-restart cadence. One extra + // tick past the first failed attempt is enough to push an aged Tip's + // closed-batch form past MAX_WAIT_BLOCKS (1175 + 100 > 1200). + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline: deposit + transfer — the transfer will be invalidated when + // cascade finally fires. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Both down: stop sequencer, insert proxy, disconnect. + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + // Coupled advance into the danger zone. + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // L1 recovers first — proxy back online while sequencer is still stopped. + proxy.reconnect(); + + // Simulated orchestrator loop. Each failed attempt advances L1 (and + // wall-clock) by ~20 min; the aged Tip eventually ages past + // `MAX_WAIT_BLOCKS` and cascade fires on a subsequent respawn. + let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + // The cascade fired somewhere in the loop — the transfer was invalidated. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected at least one invalidation after restart-cycle cascade: {counts:?}", + ); + + // Verify via WS replay: only the re-drained deposit appears, the transfer + // is gone. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "Alice must get her full deposit back after cascade", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's balance must roll back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// Sequencer outage, coupled wall+L1 advance into the danger zone, +// orchestrator restart cycle converges. +// +// Realistic counterpart to the decoupled +// (`sequencer_outage_danger_zone_tip_cascade_test`). In a real outage both +// L1 and wall clock advance together. +// +// The most likely outcome on respawn is `RecoverTip`: the open Tip's first +// frame has aged past `danger_threshold` and `check_danger` fires `TipInDanger(idx)` +// (the closed-frontier check finds nothing past the gold frontier). Recovery +// cascades the Tip directly without a flush; on a healthy L1, the first +// respawn typically converges to Stable. +// +// Other paths can fire depending on timing — e.g., the lane might close the +// Tip into a nonced batch before the detector trips, the submitter might +// get the batch onto L1 fresh, and convergence happens by the next respawn +// seeing it in `safe_inputs`. Or the closed batch lands stale, routes +// through `FlushAndCascade`, and converges after a flush cycle. +// +// The test's load-bearing assertion is restart-loop convergence under a +// realistic coupled outage, not which specific recovery path fires nor how +// many attempts that path requires. +async fn run_sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Sequencer outage: stop, do NOT insert a proxy. Coupled L1+wall advance + // into the danger zone. + runtime.stop().await?; + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + // Convergence is the load-bearing claim. The number of attempts depends + // on which recovery path fires (`RecoverTip` typically converges on the + // first respawn; `FlushAndCascade` may take more), so we don't pin a + // minimum here. + assert!( + !outcomes.is_empty(), + "respawn_until_stable must record at least one attempt" + ); + + let counts = runtime.count_batches()?; + + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + if counts.invalidated >= 1 { + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "cascade must roll Alice back to the full deposit", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + } else { + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(500_000_u64), + "if no cascade fired, the pre-outage transfer must have remained canonical", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::from(100_000_u64), + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 1); + } + + Ok(()) +} + +// follow-up — Provider outage into the danger zone while the +// sequencer is running, mid-run danger-detected exit, then reconnect + restart +// cycle converges. +// +// The provider-outage danger-zone test stops at "refuse to reboot while proxy still +// disconnected". This completes that story: after the sequencer self-exits +// mid-run via its live wall-clock fallback and the proxy reconnects, the +// orchestrator restart cycle eventually converges — same +// `respawn_until_stable` pattern as . +// +// Ordering detail: the wall-clock advance only advances the sequencer's +// clock; the proxy has been disconnecting Anvil traffic, so Anvil's block +// count advanced via `mine_l1_blocks` (which bypasses the proxy). When the +// proxy reconnects, the sequencer sees both the shifted wall clock and the +// fresh safe head via the same RPC connection. +async fn run_provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline — deposit + transfer while running directly against Anvil, + // then route through the proxy for the outage. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + } + + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + drop(ws); + + // Mid-run outage: proxy goes down, coupled wall+L1 advance into danger. + // The running sequencer's submitter tick hits the disconnect, runs + // danger check, sees a non-Safe status, and exits. + proxy.disconnect(); + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + let exit_status = runtime.wait_for_exit(Duration::from_secs(30)).await?; + assert!( + !exit_status.success(), + "sequencer must self-exit on mid-run danger detection, got {exit_status:?}", + ); + + // L1 comes back. Run the orchestrator cycle: the aged closed batch + // eventually ages past `MAX_WAIT_BLOCKS` and startup recovery cascades. + proxy.reconnect(); + + let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected cascade after mid-run exit + restart cycle: {counts:?}", + ); + + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// First-boot-with-L1-down refuses to boot (wall-clock fallback +// treats "never synced" as danger). +// +// `wall_clock_danger_estimate` has a distinguished branch for `last_sync_ms +// == 0`: it refuses to proceed because the sequencer has no baseline to +// measure drift against, so issuing soft confirmations against whatever +// stale safe head we last saw is unsafe. There's a unit test covering that +// branch in isolation; this e2e confirms the full `run()` boot path +// respects it end-to-end. +// +// How we reach the condition: the harness's `spawn()` does a successful +// first boot (needs L1 reachable to deploy contracts and pin the deployment +// identity). We stop, rewrite the recorded L1 safe-head observation to +// unknown, then respawn with the proxy disconnected. The deployment identity +// is still populated — so the sequencer gets past the contract-discovery +// phase — but `check_danger` sees the missing safe-head row and +// `decide_startup_action` returns `Refuse(L1ViewStale)`. +// +// Scope note: a "truly" first-ever boot would fail even earlier (no +// deployment identity, can't discover contracts). That's a separate test; this +// one targets only the L1-view freshness branch. +async fn run_first_boot_l1_unreachable_never_synced_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Baseline boot so the deployment identity lands on disk. + { + let _ws = runtime.ws(0).await?; + } + + runtime.stop().await?; + + // Simulate "never synced L1" by clearing the safe-head observation. + runtime.clear_l1_safe_head_observation()?; + + // Route the sequencer through a disconnected proxy so L1 is unreachable + // from the sequencer's perspective. + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "never-synced + L1-unreachable must refuse to boot, got {respawn_result:?}", + ); + + // Confirm the refusal is reversible: reconnect the proxy and the + // sequencer boots normally (the wall-clock fallback path is gated on + // L1 unreachability, not on any persistent flag). + proxy.reconnect(); + runtime.respawn().await?; + + proxy.shutdown().await?; + Ok(()) +} + +// Past-stale closed+submitted batch (delayed-inclusion cascade). +// +// Scenario: a batch closes and the submitter's L1 tx is never mined (the +// gateway dropped it, mempool evicted it, whatever). Blocks accumulate. On +// the sequencer's next startup recovery, the batch's first frame is > +// `MAX_WAIT_BLOCKS` behind current_safe_block, so the scheduler skips it +// in `populate_safe_accepted_batches` and `find_first_batch_in_danger` +// flags it — cascade fires. +// +// This is the structural sibling of (open-batch variant) for +// closed+submitted batches. The `find_first_batch_in_danger` path has two +// flavors: "open batch got old" and "closed batch submission +// got lost" (this one). Both need to cascade correctly; had e2e +// coverage, the closed-submitted variant had none. +// +// Setup shape: we use Anvil's `setAutomine(false)` + `dropAllPendingTxs` +// (new T2 harness primitives) to hold the sequencer's batch-submission tx +// out of the chain, then drop it entirely — cleaner than the mempool-hold +// approach, because `anvil_mine(N)` with a pending tx would include it in +// the first mined block, not the Nth. Dropping simulates gateway packet +// loss directly and advances 1250 genuinely empty blocks. +async fn run_delayed_inclusion_cascades_on_restart_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past-stale: 1250 blocks > MAX_WAIT_BLOCKS (1200). + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + // Enough transfers to trigger at least one size-based batch close. + // Matches 's sizing (≈100 B/op × 150 ops ≈ 15 KB > 12 KB target). + const TRANSFERS_TO_FORCE_BATCH_CLOSE: usize = 150; + // After the last transfer, wait for the submitter's next tick so it + // picks up the closed batch and sends the L1 tx to the (now-held) + // mempool. Default `idle_poll_interval` is 5 s. + const WAIT_FOR_SUBMITTER_TICK: Duration = Duration::from_secs(7); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Fund Alice generously — 151 transfers + fees is well under 10 M. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(10_000_000_u64); + let mut replay_before = ReplayWalletApp::devnet(); + + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + + // Capture the sealed-batch baseline BEFORE we disable auto-mining so + // we can assert that at least one new batch sealed during the + // mempool-held phase. + let batches_before_close = runtime.count_batches()?; + + // Hold the mempool. From here, txs go to Anvil but don't mine until + // we either re-enable auto-mining or call `anvil_mine`. + runtime.set_automine(false).await?; + + // Submit enough transfers to trigger at least one size-triggered + // batch close. Each POST /tx is processed by the sequencer + // synchronously; the inclusion lane seals a batch when cumulative + // user-op bytes exceed the target. + let mut alice_l2 = runtime.wallet_l2(alice)?; + for _ in 0..TRANSFERS_TO_FORCE_BATCH_CLOSE { + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Give the submitter tick time to fire and put the batch-submission + // tx into the (held) mempool. + tokio::time::sleep(WAIT_FOR_SUBMITTER_TICK).await; + + let batches_after_close = runtime.count_batches()?; + assert!( + batches_after_close.sealed > batches_before_close.sealed, + "expected at least one new sealed batch: before={batches_before_close:?} after={batches_after_close:?}", + ); + + // Shut the sequencer down, then drop the mempool so the submitted + // batch tx never lands. The sequencer's DB still shows a sealed + // batch; L1 has no corresponding event. + drop(ws); + runtime.stop().await?; + runtime.drop_all_pending_txs().await?; + + // Advance past MAX_WAIT_BLOCKS. With auto-mining still off but the + // mempool empty, these are genuinely empty blocks — nothing to + // include. `advance_wall_and_mine` also shifts the sequencer's + // faketime offset so the wall-clock fallback stays in sync with L1. + runtime.advance_wall_and_mine(PAST_STALE).await?; + + // Re-enable auto-mining before respawn: startup recovery's flush step + // submits a no-op at the stuck wallet-nonce slot and needs it mined + // to progress. With auto-mining off, the flusher would hang. + runtime.set_automine(true).await?; + + runtime.respawn().await?; + + // Verify cascade fired. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected cascade-invalidation of the delayed-inclusion batch: {counts:?}", + ); + + // Replay from offset 0: the deposit must be re-drained (it's still a + // safe L1 input), and the sealed batch's transfers must be gone. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "Alice must have her full deposit back (all transfers invalidated)", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's receiving balance must roll back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + Ok(()) +} + +// Aging open Tip trips the runtime detector and the next startup +// runs RecoverTip. +// +// The runtime danger detector intentionally fires on `DangerStatus::TipInDanger(_)`: +// once the open Tip's first frame has aged past `danger_threshold`, the +// inclusion lane has failed to rotate it within `max_batch_open` (2 h ≈ +// 600 blocks, comfortably below `danger_threshold` ≈ 900 blocks), which +// means the lane is stuck. The detector exits the process; the orchestrator +// respawns; startup dispatches `RecoverTip` and `recover_aging_tip` +// cascades the Tip without a flush (the Tip has no L1 footprint). +// +// Earlier behavior tolerated an aging Tip at runtime under the assumption +// that only closed batches with pending L1 transactions could zombie on +// confirm. That left "lane stuck" as a silent failure mode. The new policy +// makes it loud. +// +// Staging: +// 1. Baseline: deposit + transfer → Tip at first_frame_safe_block X. +// 2. `mine_l1_blocks(DANGER_ZONE_BLOCKS)` — current_safe_block jumps +// ~1150 past X, so the Tip's age clears `danger_threshold`. Wall +// clock untouched (decoupled advance). +// 3. `wait_for_exit` — input reader catches up; detector ticks; sees +// `DangerStatus::TipInDanger(_)`; process exits non-zero. +// 4. Respawn — startup `check_danger` again sees Tip in danger → +// `RecoverTip` → `recover_aging_tip` cascades the Tip + opens a fresh +// one. Alice's pre-outage transfer was a soft confirmation against +// the cascaded Tip; it's rolled back. +async fn run_aging_open_tip_runtime_danger_zone_exit_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // L1 jumps into the danger window; wall clock stays put. + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + + // The detector must trip on `DangerStatus::TipInDanger` once the input reader + // catches up. Allow a window for input-reader poll (~2 s) plus + // detector poll (2 s) plus margin. + let exit = runtime.wait_for_exit(Duration::from_secs(15)).await?; + assert!( + !exit.success(), + "sequencer must exit non-zero on `DangerStatus::TipInDanger` once the Tip's \ + first frame ages past `danger_threshold`, got {exit:?}", + ); + + // No cascade fires on detector exit alone. The recovery happens at + // the next startup: `RecoverTip` → `recover_aging_tip` cascades the + // Tip and opens a fresh one. Alice's transfer was inside the Tip; + // it's rolled back. + let counts_before = runtime.count_batches()?; + assert_eq!( + counts_before.invalidated, 0, + "detector exit alone must not invalidate batches; that happens at startup: {counts_before:?}", + ); + + runtime.respawn().await?; + + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "Tip cascade must roll Alice back to her deposit", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob never received the rolled-back transfer", + ); + assert_eq!( + replay_after.current_user_nonce(alice_address), + 0, + "nonce must reset when the Tip carrying the user op is cascaded", + ); + + let counts_after = runtime.count_batches()?; + assert!( + counts_after.invalidated >= 1, + "RecoverTip must invalidate at least the cascaded Tip; got {counts_after:?}", + ); + + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + Ok(()) +} + +// Provider reachable, safe head frozen, live freshness check fires and the +// running detector self-exits. +// +// Runtime twin of . Tests that the wall-clock-adjusted danger arm +// fires at runtime when L1 stays reachable but the safe head stops +// advancing. We don't pre-age the Tip — the detector now exits on +// `DangerStatus::TipInDanger` independently, which would conflate test signals. +// Instead, we let the Tip stay fresh and jump the wall clock past +// `danger_threshold * SECONDS_PER_BLOCK` so the wall-clock arm saturates +// to threshold 0 and catches the fresh Tip via `find_first_batch_in_danger`. +async fn run_stalled_safe_head_live_exit_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // 4 hours: same rationale as `run_stalled_safe_head_startup_refuses_boot_test`. + // missed_blocks > danger_threshold → adjusted threshold saturates to 0 + // → wall-clock arm catches even a fresh Tip. + const STALLED_SAFE_HEAD_OFFSET: &str = "+14400s"; + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Don't mine any L1 blocks — keep the safe head where it is. Just + // jump the wall clock far enough that the wall-clock arm fires. + runtime.set_faketime_offset(Some(STALLED_SAFE_HEAD_OFFSET.to_string()))?; + + let exit = runtime.wait_for_exit(Duration::from_secs(15)).await?; + assert!( + !exit.success(), + "reachable-but-stalled safe head must force a non-zero self-exit via \ + the wall-clock-adjusted danger arm, got {exit:?}", + ); + + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "live stalled-safe-head shutdown must not cascade batches on its own: {counts:?}", + ); + + Ok(()) +} + +// Reconnect at a previously-observed offset that got invalidated +// after the WS connection dropped. +// +// A WS connection cannot span invalidation: the sequencer necessarily exits +// (danger detection or stop) before any cascade runs (`recover_post_flush` or +// `recover_aging_tip`), and the socket dies with the process. The +// meaningful invariant is the **reconnect** behavior — +// a client that reconnects at `from_offset=N`, where `N` was an offset it +// previously received and whose row is *now invalidated*, must see the +// cursor skip cleanly past `N` and deliver only post-recovery events. +// +// covers the adjacent case (`from_offset=0`), which trivially walks +// `valid_sequenced_l2_txs` from the start. This case is distinct because +// the query `WHERE offset > N` is pointed at an offset that no longer +// exists in the valid view. +async fn run_ws_reconnect_at_invalidated_offset_skips_cleanly_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past-stale: matches `recovery_after_stale_batches_test` sizing. + const PAST_STALE: Duration = blocks_as_duration(MAX_WAIT_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay_before = ReplayWalletApp::devnet(); + + // Build up offsets 0 (deposit) and 1 (transfer) and capture the + // transfer's offset so we can later reconnect at it. + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + let transfer_msg = ws.expect_user_op_from(alice_address).await?; + let last_seen_offset = transfer_msg.offset(); + replay_before.apply(transfer_msg)?; + + // Kill the WS socket and the sequencer (same way a real reconnect arc + // works — process dies, client dials back in). + drop(ws); + runtime.stop().await?; + + runtime.advance_wall_and_mine(PAST_STALE).await?; + runtime.respawn().await?; + + // Reconnect at the last offset the client observed — now invalidated. + // The query `WHERE offset > last_seen_offset` against + // `valid_sequenced_l2_txs` must skip cleanly past the invalidated + // rows and deliver only the post-recovery events (the re-drained + // deposit). + let mut ws_after = runtime.ws(last_seen_offset).await?; + let redrained = ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + // The re-drained deposit's offset is strictly greater than the + // last-seen offset — if the cursor ever delivered an invalidated row + // or the same offset again, that'd be the regression. + assert!( + redrained.offset() > last_seen_offset, + "re-drained event must have a strictly-greater offset: \ + last_seen={last_seen_offset}, redrained={}", + redrained.offset(), + ); + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Sanity check: also reconnecting at 0 produces the same single event + // ('s property), to rule out any one-off weirdness in the + // non-zero reconnect path. + drop(ws_after); + let mut ws_from_zero = runtime.ws(0).await?; + let redrained_from_zero = ws_from_zero + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + assert_eq!( + redrained.offset(), + redrained_from_zero.offset(), + "reconnect-at-invalidated and reconnect-at-zero must deliver the \ + same next valid event", + ); + ws_from_zero + .expect_no_message_for(NO_WS_MESSAGE_WAIT) + .await?; + + Ok(()) +} + +// `from_offset=future` waits silently without erroring. +// +// A subscribe at a far-future offset is a valid subscription that should +// behave the same way `from_offset=0` does on an empty feed: sit idle on +// the live broadcast channel until an event with a greater offset arrives, +// no error, no close. +// +// The behavior is deliberately consistent with `from_offset=0` on an empty +// head — otherwise we'd be making the wait-for-something-new path differ +// based on whether history exists. Test pins this as part of the WS +// subscription contract. +async fn run_ws_subscribe_from_future_offset_waits_silently_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Comfortably beyond any offset this test will produce. `sequenced_l2_txs` + // is rowid-based; rowid_u64 ≤ a few by the end of the short workload. + const FUTURE_OFFSET: u64 = 1_000_000; + // Enough real time to observe "waits silently" without being slow. + const WAIT_WINDOW: Duration = Duration::from_secs(2); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Seed some actual events so we're not testing "empty head, future + // offset" (trivial case). We want "non-trivial head, offset beyond it". + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(500_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Subscribe far beyond the current head. The subscribe itself must + // succeed (no 4xx / WS close code), and the resulting stream must be + // quiet until something with a greater offset arrives. + let mut ws_future = runtime.ws(FUTURE_OFFSET).await?; + ws_future.expect_no_message_for(WAIT_WINDOW).await?; + + // Generate more activity. These events are still at offsets far below + // `FUTURE_OFFSET`, so they must not be delivered — the subscription + // keeps waiting. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + ws_future.expect_no_message_for(WAIT_WINDOW).await?; + + Ok(()) +} + +// Safe direct input that was NOT yet drained before the cascade +// must be drained into the recovery batch's first frame. +// +// Distinct from `recovery_after_stale_batches_test`, where the +// direct input was drained into an invalidated batch and gets *re*-drained +// on recovery. Here we exercise the simpler case: the input hit the +// sequencer's view post-stop, so it was never referenced by any frame; +// recovery must include it in the fresh batch's leading range. +// +// Setup: +// 1. Spawn + stop immediately. Initial Tip is empty and anchored at an +// early safe_block. +// 2. Deposit on L1 directly (sequencer is stopped, so the event isn't +// consumed yet). +// 3. Advance L1 past MAX_WAIT_BLOCKS to age the empty initial Tip past +// stale. +// 4. Respawn. Startup recovery syncs the new safe head, sees the +// deposit in `safe_inputs`, cascades the aged initial Tip, and opens +// a recovery batch with `leading_range = [next_undrained, end)` — +// including the undrained deposit. +// 5. WS replay at offset 0 must deliver the deposit event (drained +// exactly once, into the recovery batch's first frame). +async fn run_recovery_drains_safe_but_undrained_direct_input_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let alice_address = alice.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + + // Stop the sequencer before any user-level activity. The initial Tip + // is empty and anchored at whatever safe_block the lane saw on first + // boot. + runtime.stop().await?; + + // Deposit happens entirely on L1 while the sequencer is offline — + // WalletL1Client dials Anvil directly, not through the sequencer. + let deposit_amount = U256::from(600_000_u64); + alice_l1.mint_supported_token(deposit_amount).await?; + alice_l1.deposit_supported_token(deposit_amount).await?; + + // Advance L1 past MAX_WAIT_BLOCKS + safe-depth so the aged empty + // initial Tip gets cascaded and the deposit event is safe. + runtime.advance_wall_and_mine(PAST_STALE).await?; + + runtime.respawn().await?; + + // WS from offset 0. Recovery batch's first frame must contain the + // deposit (never drained before), and nothing else. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + let deposit_msg = ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + replay_after.apply(deposit_msg)?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "the deposit, never previously drained, must land in the recovery \ + batch's first frame", + ); + + // Cascade fired on the empty initial Tip. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the empty initial Tip to be cascaded: {counts:?}", + ); + + Ok(()) +} + +// Recovery batch opens empty when no direct inputs are pending. +// +// Negative control for : same overall shape but with no L1 deposit +// before respawn. The recovery batch's `leading_range` is `[0, 0)` and the +// batch's first frame is empty. WS replay delivers nothing. +async fn run_recovery_batch_opens_empty_when_no_direct_inputs_pending_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + runtime.stop().await?; + + // No deposits, no user ops. Just age the initial Tip past stale. + runtime.advance_wall_and_mine(PAST_STALE).await?; + + runtime.respawn().await?; + + // WS from offset 0 must deliver nothing — the recovery batch is empty. + let mut ws_after = runtime.ws(0).await?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Cascade still fired (empty initial Tip past MAX_WAIT). + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the empty initial Tip to be cascaded even without direct \ + inputs: {counts:?}", + ); + + Ok(()) +} + +// Replay determinism: for any workload accepted live, catch-up +// replay must produce an identical per-user state. +// +// This is the `Application` trait's fundamental contract (see +// `AGENTS.md` §Application-Trait-Contract). Without it, restart +// replay and WS catch-up aren't equivalent to live execution — the +// whole soft-confirmation model collapses. +// +// `restart_and_replay_test` covers a single-user two-op workload; this +// test uses a deliberately diverse multi-user, multi-op mix (three +// senders, deposits interleaved with transfers and withdrawals) and +// asserts a *direct* equality between the live replay (assembled from +// WS events observed during execution) and the post-restart replay +// (assembled from WS catch-up at offset 0). Any per-user balance or +// nonce divergence would signal a non-deterministic application or a +// catch-up bug. +async fn run_replay_matches_live_for_mixed_workload_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let charlie = TestSigner::from_default(3)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + let charlie_address = charlie.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let charlie_l1 = runtime.wallet_l1(charlie.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut bob_l2 = runtime.wallet_l2(bob)?; + let mut charlie_l2 = runtime.wallet_l2(charlie)?; + + let mut ws = runtime.ws(0).await?; + let mut replay_live = ReplayWalletApp::devnet(); + + // Diverse workload — exercises deposit-interleaving and every op + // combination supported by the wallet app. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_live, + &alice_l1, + U256::from(1_000_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(400_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(alice_address).await?)?; + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_live, + &charlie_l1, + U256::from(500_000_u64), + ) + .await?; + bob_l2 + .transfer(charlie_address, U256::from(150_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(bob_address).await?)?; + + charlie_l2.withdraw(U256::from(100_000_u64)).await?; + replay_live.apply(ws.expect_user_op_from(charlie_address).await?)?; + + alice_l2 + .transfer(charlie_address, U256::from(50_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(alice_address).await?)?; + + bob_l2.withdraw(U256::from(50_000_u64)).await?; + replay_live.apply(ws.expect_user_op_from(bob_address).await?)?; + + let expected_input_count = replay_live.executed_input_count(); + + // Restart + catch-up replay. Each WS catch-up event feeds the fresh + // replay identically to how the live stream fed the original; if the + // application is deterministic, the two replays must be bit-identical + // across every per-user view the replay exposes. + drop(ws); + runtime.restart().await?; + let mut ws_after = runtime.ws(0).await?; + let mut replay_post = ReplayWalletApp::devnet(); + + // Two deposits + five user ops = seven events. + for _ in 0..expected_input_count { + replay_post.apply(ws_after.next_message().await?)?; + } + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + for addr in [alice_address, bob_address, charlie_address] { + assert_eq!( + replay_post.current_user_balance(addr), + replay_live.current_user_balance(addr), + "balance divergence for {addr:?}: live vs. replay must match", + ); + assert_eq!( + replay_post.current_user_nonce(addr), + replay_live.current_user_nonce(addr), + "nonce divergence for {addr:?}: live vs. replay must match", + ); + } + assert_eq!( + replay_post.executed_input_count(), + replay_live.executed_input_count(), + ); + + Ok(()) +} + +// Transient provider outage: the L1 input reader must +// retry on provider errors (connection refused, timeout) without +// crashing, and pick up the backlog on reconnect. +// +// Distinct from `provider_outage_short_hiccup_no_recovery_test`, which tests +// the batch submitter's retry path via POST activity. Here the interesting +// component is the **input reader**: its only job is polling L1 for new +// events, so the only observable signal that its retry loop works is +// whether a deposit made *during the disconnect* (and thus invisible +// until the proxy comes back) lands on the WS feed after reconnect. +// +// Scenario: +// 1. Route the sequencer through the proxy. +// 2. Disconnect proxy. Alice deposits on L1 (via `WalletL1Client`, +// which dials Anvil directly — bypassing the proxy). +// 3. Advance a few L1 blocks to push the deposit past safe depth. The +// sequencer's reader keeps failing to fetch (connection refused +// from the disconnected proxy) and retrying. +// 4. Reconnect proxy. The reader's next poll succeeds; backlog is +// pulled in; the WS subscriber (still connected) receives the +// deposit event. +// 5. Assert the sequencer didn't crash (no respawn needed, still same +// child) and the deposit landed. +async fn run_provider_outage_input_reader_retries_after_reconnect_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Well below any stale threshold — we just need safe-depth headroom. + const SAFE_DEPTH_HEADROOM_BLOCKS: u64 = 20; + + let alice = TestSigner::from_default(1)?; + let alice_address = alice.address(); + + // Route through the proxy (stop → override → respawn). + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let alice_l1 = runtime.wallet_l1(alice).await?; + let mut ws = runtime.ws(0).await?; + let mut replay = ReplayWalletApp::devnet(); + + // Baseline deposit with the proxy connected — proves the WS + reader + // path works end-to-end before we break it. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(300_000_u64), + ) + .await?; + + // Proxy down. The sequencer's reader polls on a ~2 s cadence; each + // poll will fail with a connection-refused-style provider error until + // we reconnect. + proxy.disconnect(); + + // Deposit while the proxy is down. The L1 wallet bypasses the proxy, + // so Anvil sees the deposit but the sequencer can't. + let late_amount = U256::from(400_000_u64); + alice_l1.mint_supported_token(late_amount).await?; + alice_l1.deposit_supported_token(late_amount).await?; + runtime.mine_l1_blocks(SAFE_DEPTH_HEADROOM_BLOCKS).await?; + + // During the disconnect, the reader should keep retrying rather than + // crashing. Assert the sequencer stays up for a few real seconds + // (long enough for multiple reader polls to fail + retry). + let early_exit = runtime.observe_for(Duration::from_secs(5)).await?; + assert!( + early_exit.is_none(), + "input reader must retry provider errors, not crash the process: \ + got unexpected exit {early_exit:?}", + ); + + // Reconnect. The reader's next poll succeeds, picks up the backlog, + // WS subscriber receives the event. + proxy.reconnect(); + + let late_deposit_msg = ws + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + replay.apply(late_deposit_msg)?; + + assert_eq!( + replay.current_user_balance(alice_address), + U256::from(700_000_u64), + "both deposits should be reflected after reader catches up", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// First-ever boot with no deployment identity + L1 unreachable refuses before +// recovery logic runs. +// +// Distinct from `run_first_boot_l1_unreachable_never_synced_refuses_boot_test` +// (already covered): that test exercises the wall-clock fallback inside +// `run_preemptive_recovery`, which only fires AFTER bootstrap discovery has +// succeeded once (so the deployment identity is pinned). This test targets +// the earlier failure: the +// `InputReader::new` discovery step where the sequencer asks L1 for the +// InputBox address. With no deployment identity, that call has no +// fallback and the boot fails before recovery logic runs. +// +// The harness simulates "no deployment identity" by `reset_database()` after +// a normal boot has populated it (truly first-ever boot would also lack +// a pinned identity, but the failure mode is identical: the bootstrap +// step has nothing to fall back to). +async fn run_first_boot_no_identity_l1_unreachable_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Baseline boot to ensure the schema is fully migrated. We then + // reset the DB to mimic a first-ever boot. + { + let _ws = runtime.ws(0).await?; + } + runtime.stop().await?; + runtime.reset_database()?; + + // Route through a disconnected proxy so InputReader::new fails with + // a provider error. + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "first boot with no deployment identity + L1 unreachable must refuse boot, got {respawn_result:?}", + ); + + // Verify reversibility: reconnect proxy, respawn, this time the + // bootstrap step succeeds and pins the deployment identity. + proxy.reconnect(); + runtime.respawn().await?; + + proxy.shutdown().await?; + Ok(()) +} + +// Chain-id mismatch via the live RPC path. +// +// Companion to `chain_id_mismatch_from_deployment_identity_returns_typed_error` +// in `sequencer/tests/chain_id_validation.rs`. The identity-fallback test runs +// in-process against a pre-seeded DB; this test runs the full sequencer binary +// against real Anvil with a deliberately mismatched `--chain-id`, proving the +// RPC-comparison path refuses before pinning a wrong deployment identity. +// +// The pre-write ordering matters: a regression that swapped the +// identity-write and the chain-id check would leave a bad identity row on +// disk, poisoning future startups. Asserting `respawn_result.is_err()` +// alone catches the bad-error case; we additionally verify a +// post-correction respawn succeeds, which only happens if identity pinning +// wasn't poisoned (bootstrap checks the L1 chain id again, sees it matches, +// then writes the correct deployment identity). +async fn run_chain_id_mismatch_via_live_rpc_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Anvil runs at `DEVNET_CHAIN_ID = 31337`. Pick something obviously + // different that's still valid (chain_id > 0). + const WRONG_CHAIN_ID: u64 = 99_999; + + // Initial boot completes normally (no override). This pins the correct + // deployment identity. + { + let _ws = runtime.ws(0).await?; + } + runtime.stop().await?; + + // Reset the DB so the live RPC path runs (otherwise the identity-fallback + // path would catch the mismatch first). + runtime.reset_database()?; + + // Configure a mismatched chain id and respawn. The bootstrap-time + // RPC check returns the actual chain id (31337), compares it with + // the configured `--chain-id` (99999), and returns ChainIdMismatch. + runtime.set_chain_id_override(Some(WRONG_CHAIN_ID)); + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "chain-id mismatch via live RPC must refuse boot, got {respawn_result:?}", + ); + + // Reset to the correct chain id. Respawn must succeed — proves the + // failed attempt didn't poison the cache or other DB state. + runtime.set_chain_id_override(None); + runtime.respawn().await?; + + Ok(()) +} + +// Nonce-0 first batch recovery edge. +// +// Two coupled invariants: +// - Cascade from no valid ancestor: if the first-ever batch (nonce 0) +// goes stale before any +// batch reaches `Gold` (i.e., before any batch is L1-accepted), +// recovery cascades it and opens a fresh recovery batch that itself +// has nonce 0 (parent NULL — there's no valid ancestor to point +// at). No genesis sentinel exists in the implementation; the +// parent-pointer schema must handle "all batches invalidated" +// natively. +// - Reused nonce acceptance: after recovery, the recovery batch (with +// nonce 0 reused) +// submits to L1, gets accepted by `populate_safe_accepted_batches`, +// and lands in `safe_accepted_batches` — proving the scheduler- +// simulation cursor handles a reused nonce after cascade correctly. +// +// The structural invariants are validated by +// `assert_schema_invariants` (post-test hook in `tests/e2e/src/main.rs`): +// it checks that NULL-parent batches have nonce 0 and that valid-path +// nonces form a contiguous `0..N`. So this test asserts those +// observable consequences plus the explicit `safe_accepted_batches` +// post-condition for . +// +// Setup uses T2 (auto-mining off + drop) so the first batch's L1 +// submission is dropped before reaching the chain — guaranteeing it +// never reaches `Gold` before being cascaded. +async fn run_nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past stale to ensure the cascade fires. + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + // Force a size-triggered batch close. Same sizing as . + const TRANSFERS_TO_FORCE_CLOSE: usize = 150; + // Submitter idle_poll_interval = 5 s; allow one tick for the batch + // to enter the (held) mempool. + const WAIT_FOR_SUBMITTER_TICK: Duration = Duration::from_secs(7); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + + // Fund Alice and queue many transfers into the open batch (which is + // the FIRST EVER batch — nonce 0). Using auto-mining-off across the + // submitter's tick so the batch's L1 tx hits the mempool but never + // mines, then dropping it. + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(10_000_000_u64), + ) + .await?; + + runtime.set_automine(false).await?; + + for _ in 0..TRANSFERS_TO_FORCE_CLOSE { + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Let the submitter tick fire and put the (nonce-0) batch's L1 + // tx into the held mempool. + tokio::time::sleep(WAIT_FOR_SUBMITTER_TICK).await; + } + + runtime.stop().await?; + runtime.drop_all_pending_txs().await?; + + runtime.advance_wall_and_mine(PAST_STALE).await?; + runtime.set_automine(true).await?; + + runtime.respawn().await?; + + // assertions: the only existing batch (the original nonce-0 + // one) was cascaded, and a recovery batch was opened. The recovery + // batch's invariants (NULL parent → nonce 0) are checked structurally + // by the post-test `assert_schema_invariants` hook. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the original nonce-0 batch to be invalidated: {counts:?}", + ); + assert!( + counts.total > counts.invalidated, + "recovery batch must exist alongside the invalidated original: {counts:?}", + ); + + // Replay shows the deposit re-drained, transfers gone (rolled back). + // Recreate WS + wallet against the post-respawn HTTP endpoint + // (`runtime.endpoint()` rebinds to a fresh port on every respawn). + let mut ws_after = runtime.ws(0).await?; + let mut alice_l2_fresh = runtime.wallet_l2(alice)?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(10_000_000_u64), + "Alice must have her full deposit back after nonce-0 cascade", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO,); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // drive enough work into the recovery batch that the + // submitter closes it by size and submits to L1. With auto-mining + // back on, the submission lands and the input reader picks it up + // into `safe_inputs`; `populate_safe_accepted_batches` accepts it + // at the expected nonce (0, reused). + for _ in 0..TRANSFERS_TO_FORCE_CLOSE { + alice_l2_fresh + .transfer(bob_address, U256::from(1_u64)) + .await?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + } + + // Wait for the submitter to fire a tick + submit the batch. Anvil's + // instamine puts the submission at 1 confirmation; the submitter's + // `wait_for_confirmations` needs `confirmation_depth + 1 = 3`. We + // explicitly mine the remaining 2 blocks below to unblock it without + // having to wait the full 72s timeout. + tokio::time::sleep(Duration::from_secs(7)).await; + runtime.mine_l1_blocks(2).await?; + + // After confirmations land, the submitter's tick loop continues: + // next iteration runs `refresh_recovery_metadata` → + // `populate_safe_accepted_batches_inner`, which appends the batch + // to `safe_accepted_batches` at its expected nonce (0, reused). + tokio::time::sleep(Duration::from_secs(10)).await; + + let (accepted_count, min_accepted_nonce) = runtime.count_safe_accepted_batches()?; + assert!( + accepted_count >= 1, + "expected at least one batch to land in safe_accepted_batches \ + post-recovery (proves reused-nonce-0 was accepted): \ + count={accepted_count}", + ); + assert_eq!( + min_accepted_nonce, + Some(0), + "the first L1-accepted batch must have nonce 0 (reused after \ + cascade) — got {min_accepted_nonce:?}", + ); + + Ok(()) +} + +fn eip712_domain(runtime: &ManagedSequencer) -> alloy_sol_types::Eip712Domain { + sequencer_core::build_input_domain(runtime.domain_chain_id(), runtime.verifying_contract()) } fn ssz_encode_transfer(to: Address, amount: U256) -> Vec { diff --git a/tests/harness/Cargo.toml b/tests/harness/Cargo.toml index 7c9644b..76256a8 100644 --- a/tests/harness/Cargo.toml +++ b/tests/harness/Cargo.toml @@ -17,6 +17,7 @@ alloy-sol-types = "1.4.1" cartesi-rollups-contracts = "=2.2.0" futures-util = "0.3" k256 = "0.13.4" +rusqlite = { version = "0.38.0", features = ["bundled"] } sequencer-core = { path = "../../sequencer-core" } sequencer-rust-client = { path = "../../sdk/rust-client" } serde = { version = "1", features = ["derive"] } diff --git a/tests/harness/src/lib.rs b/tests/harness/src/lib.rs index 2f4d5c5..a528461 100644 --- a/tests/harness/src/lib.rs +++ b/tests/harness/src/lib.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 (see LICENSE) pub mod paths; +pub mod proxy; pub mod replay; pub mod rollups; pub mod sequencer; @@ -11,11 +12,12 @@ pub mod ws; pub type HarnessResult = Result>; +pub use proxy::TcpProxy; pub use replay::ReplayWalletApp; pub use rollups::{DEVNET_CHAIN_ID, DevnetRollupsStack}; pub use sequencer::{ - DEFAULT_DEVNET_SEQUENCER_BIN, DEFAULT_TEST_LOGS_DIR, ManagedSequencer, ManagedSequencerConfig, - default_devnet_sequencer_config, + BatchCounts, DEFAULT_DEVNET_SEQUENCER_BIN, DEFAULT_TEST_LOGS_DIR, ManagedSequencer, + ManagedSequencerConfig, RespawnAttemptOutcome, RespawnPolicy, default_devnet_sequencer_config, }; pub use wallet::{ TestSigner, WalletL1Client, WalletL2Client, address_from_signing_key, sign_user_op_hex, diff --git a/tests/harness/src/proxy.rs b/tests/harness/src/proxy.rs new file mode 100644 index 0000000..778325b --- /dev/null +++ b/tests/harness/src/proxy.rs @@ -0,0 +1,439 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! TCP proxy with programmatic `disconnect()` / `reconnect()` for outage +//! simulation in tests. +//! +//! Layout: +//! +//! ```text +//! Sequencer ──→ TcpProxy (127.0.0.1:proxy_port) ──→ Anvil (upstream) +//! ↑ +//! disconnect() / reconnect() +//! controlled from test code +//! ``` +//! +//! Behavior: +//! +//! - `disconnect()` flips an internal flag. All existing forwarded connections +//! are torn down (their forwarding tasks observe the flag and exit, dropping +//! the sockets). New connection attempts still succeed at the TCP accept +//! level, but are immediately closed. To the sequencer, this looks like the +//! upstream aggressively resets every connection — the same client-visible +//! behavior as a node that went down. +//! +//! - `reconnect()` flips the flag back. Subsequent connections forward +//! normally; the sequencer's next retry after backoff reconnects as if the +//! upstream is back. +//! +//! - Anvil (the real upstream) stays running behind the proxy the whole time, +//! so the test can bypass the proxy to mine blocks on it directly via a +//! separate client connected to the upstream port. That's how we simulate +//! "L1 advanced while the sequencer's gateway was down." +//! +//! The proxy listens on `127.0.0.1:0` by default, picking an ephemeral port +//! the OS hands out; the actual port is read back via `endpoint()`. + +use std::net::SocketAddr; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::task::JoinHandle; + +use crate::HarnessResult; +use crate::util::io_other; + +/// A programmable TCP proxy for L1 RPC outage simulation. +/// +/// Construct with [`TcpProxy::spawn`]. Flip the outage flag via +/// [`TcpProxy::disconnect`] / [`TcpProxy::reconnect`]. Retrieve the HTTP +/// endpoint for the sequencer to connect to via [`TcpProxy::endpoint`]. +pub struct TcpProxy { + listen_addr: SocketAddr, + upstream_addr: SocketAddr, + connected: Arc, + accept_task: Option>, + shutdown: Arc, +} + +impl TcpProxy { + /// Spawn a proxy forwarding to `upstream_url` (e.g., `http://127.0.0.1:8545`). + /// + /// The proxy binds `127.0.0.1:0` (an ephemeral port) and starts accepting + /// immediately. Use [`Self::endpoint`] to get the `http://127.0.0.1:` + /// URL for the sequencer to connect to. + pub async fn spawn(upstream_url: &str) -> HarnessResult { + let upstream_addr = parse_http_upstream(upstream_url)?; + let listener = TcpListener::bind("127.0.0.1:0") + .await + .map_err(|err| io_other(format!("proxy bind failed: {err}")))?; + let listen_addr = listener + .local_addr() + .map_err(|err| io_other(format!("proxy local_addr failed: {err}")))?; + + let connected = Arc::new(AtomicBool::new(true)); + let shutdown = Arc::new(AtomicBool::new(false)); + + let accept_task = { + let connected = connected.clone(); + let shutdown = shutdown.clone(); + tokio::spawn(async move { + accept_loop(listener, upstream_addr, connected, shutdown).await; + }) + }; + + Ok(Self { + listen_addr, + upstream_addr, + connected, + accept_task: Some(accept_task), + shutdown, + }) + } + + /// HTTP URL the sequencer should dial (e.g., `http://127.0.0.1:54321`). + pub fn endpoint(&self) -> String { + format!("http://{}", self.listen_addr) + } + + /// TCP address the proxy listens on. + pub fn listen_addr(&self) -> SocketAddr { + self.listen_addr + } + + /// Upstream Anvil TCP address (so tests can bypass the proxy to mine blocks). + pub fn upstream_addr(&self) -> SocketAddr { + self.upstream_addr + } + + /// Simulate upstream outage. All active connections are torn down and + /// future connection attempts are immediately closed. + /// + /// Idempotent: calling while already disconnected is a no-op. + pub fn disconnect(&self) { + self.connected.store(false, Ordering::SeqCst); + } + + /// Restore forwarding. Future connections forward to the upstream normally. + /// + /// Idempotent: calling while already connected is a no-op. Note that + /// existing TCP sockets that were torn down during `disconnect()` remain + /// closed; clients must establish new connections. + pub fn reconnect(&self) { + self.connected.store(true, Ordering::SeqCst); + } + + /// Returns `true` if the proxy is currently forwarding. + pub fn is_connected(&self) -> bool { + self.connected.load(Ordering::SeqCst) + } + + /// Shutdown the proxy cleanly. Called automatically on drop. + pub async fn shutdown(mut self) -> HarnessResult<()> { + self.shutdown.store(true, Ordering::SeqCst); + // Nudge the accept loop by opening a self-connection so it observes + // the shutdown flag on the next iteration. + let _ = TcpStream::connect(self.listen_addr).await; + if let Some(task) = self.accept_task.take() { + task.abort(); + let _ = task.await; + } + Ok(()) + } +} + +impl Drop for TcpProxy { + fn drop(&mut self) { + self.shutdown.store(true, Ordering::SeqCst); + if let Some(task) = self.accept_task.take() { + task.abort(); + } + } +} + +async fn accept_loop( + listener: TcpListener, + upstream_addr: SocketAddr, + connected: Arc, + shutdown: Arc, +) { + loop { + if shutdown.load(Ordering::SeqCst) { + return; + } + let (client, _) = match listener.accept().await { + Ok(pair) => pair, + Err(_) => continue, + }; + + // If the proxy is in "disconnected" mode, accept the TCP connection + // and immediately drop it. This produces the same visible effect as + // an upstream node refusing new connections. + if !connected.load(Ordering::SeqCst) { + drop(client); + continue; + } + + let connected = connected.clone(); + let shutdown = shutdown.clone(); + tokio::spawn(async move { + forward_connection(client, upstream_addr, connected, shutdown).await; + }); + } +} + +async fn forward_connection( + mut client: TcpStream, + upstream_addr: SocketAddr, + connected: Arc, + shutdown: Arc, +) { + let Ok(mut upstream) = TcpStream::connect(upstream_addr).await else { + // Upstream is unreachable — drop client (mirrors a broken forward). + return; + }; + + let (mut client_read, mut client_write) = client.split(); + let (mut upstream_read, mut upstream_write) = upstream.split(); + + // Pump bytes both directions concurrently. Exit on: + // - either half closing cleanly + // - proxy disconnect() being called + // - proxy shutdown + let client_to_upstream = async { + copy_until_disconnected(&mut client_read, &mut upstream_write, &connected, &shutdown).await + }; + let upstream_to_client = async { + copy_until_disconnected(&mut upstream_read, &mut client_write, &connected, &shutdown).await + }; + + // Race: as soon as either direction ends, the whole connection is done. + tokio::select! { + _ = client_to_upstream => {} + _ = upstream_to_client => {} + } +} + +/// Copy bytes until EOF, error, or disconnect/shutdown flag flips. +async fn copy_until_disconnected( + mut reader: R, + mut writer: W, + connected: &AtomicBool, + shutdown: &AtomicBool, +) where + R: AsyncReadExt + Unpin, + W: AsyncWriteExt + Unpin, +{ + // Small buffer is fine; JSON-RPC messages are small. We poll the flags + // between reads so a disconnect() is observed within one read of + // additional latency. + let mut buf = [0_u8; 8 * 1024]; + loop { + if shutdown.load(Ordering::SeqCst) || !connected.load(Ordering::SeqCst) { + return; + } + let read_result = + tokio::time::timeout(std::time::Duration::from_millis(50), reader.read(&mut buf)).await; + let n = match read_result { + Err(_) => continue, // timeout — poll the flags again + Ok(Ok(0)) => return, // clean EOF + Ok(Ok(n)) => n, + Ok(Err(_)) => return, + }; + if writer.write_all(&buf[..n]).await.is_err() { + return; + } + } +} + +fn parse_http_upstream(url: &str) -> HarnessResult { + // Expect `http://host:port` (optionally with a trailing slash). The proxy + // operates at the TCP level, so the scheme must be http(s) and the + // host:port pair must resolve to a single address synchronously. + let stripped = url + .strip_prefix("http://") + .or_else(|| url.strip_prefix("https://")) + .ok_or_else(|| io_other(format!("proxy upstream URL must be http(s)://, got: {url}")))?; + let host_port = stripped + .trim_end_matches('/') + .split('/') + .next() + .unwrap_or(""); + host_port + .parse::() + .map_err(|err| { + io_other(format!( + "proxy upstream URL '{url}' has invalid host:port: {err}" + )) + }) + .map_err(Into::into) +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncReadExt; + + async fn start_echo_server() -> (tokio::task::JoinHandle<()>, SocketAddr) { + let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind"); + let addr = listener.local_addr().expect("local_addr"); + let handle = tokio::spawn(async move { + loop { + let Ok((mut stream, _)) = listener.accept().await else { + return; + }; + tokio::spawn(async move { + let mut buf = [0_u8; 1024]; + while let Ok(n) = stream.read(&mut buf).await { + if n == 0 { + return; + } + if stream.write_all(&buf[..n]).await.is_err() { + return; + } + } + }); + } + }); + (handle, addr) + } + + #[tokio::test] + async fn forwards_bytes_when_connected() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect via proxy"); + client.write_all(b"hello").await.expect("write"); + + let mut buf = [0_u8; 5]; + client.read_exact(&mut buf).await.expect("read"); + assert_eq!(&buf, b"hello"); + } + + #[tokio::test] + async fn disconnect_closes_new_connections() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + proxy.disconnect(); + + // New connection is accepted at TCP level but immediately closed. + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect"); + let _ = client.write_all(b"hello").await; // may succeed or fail + let mut buf = [0_u8; 8]; + // Reading should end quickly. The OS may deliver this as EOF (n=0) or + // as ConnectionReset depending on whether our write raced ahead of + // the proxy's drop. Both are valid "connection closed" signals — we + // just assert the read doesn't hang. + let result = + tokio::time::timeout(std::time::Duration::from_millis(500), client.read(&mut buf)) + .await + .expect("read did not hang"); + match result { + Ok(0) => {} // clean EOF + Err(err) + if matches!( + err.kind(), + std::io::ErrorKind::ConnectionReset + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + ) => {} // RST, also valid + other => panic!("disconnected proxy must close the connection, got: {other:?}"), + } + } + + #[tokio::test] + async fn disconnect_tears_down_active_connections() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect"); + client.write_all(b"hi").await.expect("write"); + let mut buf = [0_u8; 2]; + client.read_exact(&mut buf).await.expect("initial read"); + assert_eq!(&buf, b"hi"); + + // Now disconnect. The active socket should be torn down. + proxy.disconnect(); + let mut tail = [0_u8; 8]; + let result = tokio::time::timeout( + std::time::Duration::from_millis(500), + client.read(&mut tail), + ) + .await + .expect("read did not hang"); + match result { + Ok(0) => {} // clean EOF + Err(err) + if matches!( + err.kind(), + std::io::ErrorKind::ConnectionReset + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + ) => {} // RST + other => { + panic!("disconnected proxy must tear down existing connections, got: {other:?}") + } + } + } + + #[tokio::test] + async fn reconnect_accepts_new_connections_again() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + proxy.disconnect(); + // Old socket is dead. Reconnect and try a fresh one. + proxy.reconnect(); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect after reconnect"); + client.write_all(b"back").await.expect("write"); + let mut buf = [0_u8; 4]; + client + .read_exact(&mut buf) + .await + .expect("read after reconnect"); + assert_eq!(&buf, b"back"); + } + + #[tokio::test] + async fn is_connected_reflects_state() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + assert!(proxy.is_connected()); + proxy.disconnect(); + assert!(!proxy.is_connected()); + proxy.reconnect(); + assert!(proxy.is_connected()); + } + + #[test] + fn parse_upstream_url_forms() { + assert!(parse_http_upstream("http://127.0.0.1:8545").is_ok()); + assert!(parse_http_upstream("http://127.0.0.1:8545/").is_ok()); + assert!(parse_http_upstream("https://127.0.0.1:8545").is_ok()); + assert!(parse_http_upstream("ws://127.0.0.1:8545").is_err()); + assert!(parse_http_upstream("127.0.0.1:8545").is_err()); + assert!(parse_http_upstream("http://not-a-host").is_err()); + } +} diff --git a/tests/harness/src/rollups.rs b/tests/harness/src/rollups.rs index e14a412..7554f17 100644 --- a/tests/harness/src/rollups.rs +++ b/tests/harness/src/rollups.rs @@ -91,6 +91,18 @@ impl DevnetRollupsStack { self.anvil.mine_blocks(block_count).await } + /// Toggle Anvil's auto-mining mode. When disabled, txs accumulate in + /// the mempool until an explicit `anvil_mine` call (or re-enable). + pub async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + self.anvil.set_automine(enabled).await + } + + /// Drop every pending tx from Anvil's mempool. Useful for simulating + /// mempool eviction or gateway packet loss. + pub async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + self.anvil.drop_all_pending_txs().await + } + pub async fn shutdown(self) -> HarnessResult<()> { self.anvil.shutdown().await } @@ -196,6 +208,17 @@ impl ManagedAnvil { } async fn mine_blocks(&self, block_count: u64) -> HarnessResult<()> { + // L1 block-time coupling: each mined block advances the L1 + // timestamp by `SECONDS_PER_BLOCK` (Ethereum mainnet parity, also + // what `ManagedSequencer::advance_wall_and_mine` assumes when + // pairing faketime advances with block counts). + // + // Without the explicit `interval`, anvil defaults to 1s/block — + // which then desyncs from faketime, making large advances trip + // spurious `L1ViewStale` even when wall clock and L1 should move + // together. See `ManagedSequencer::advance_wall_and_mine`. + const SECONDS_PER_BLOCK: u64 = 12; + if block_count == 0 { return Ok(()); } @@ -205,7 +228,7 @@ impl ManagedAnvil { .await .map_err(|err| io_other(format!("failed to connect anvil provider: {err}")))?; provider - .anvil_mine(Some(block_count), None) + .anvil_mine(Some(block_count), Some(SECONDS_PER_BLOCK)) .await .map_err(|err| { io_other(format!( @@ -214,6 +237,30 @@ impl ManagedAnvil { })?; Ok(()) } + + async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + let provider = ProviderBuilder::new() + .connect(self.endpoint.as_str()) + .await + .map_err(|err| io_other(format!("failed to connect anvil provider: {err}")))?; + provider + .anvil_set_auto_mine(enabled) + .await + .map_err(|err| io_other(format!("failed to set auto_mine={enabled}: {err}")))?; + Ok(()) + } + + async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + let provider = ProviderBuilder::new() + .connect(self.endpoint.as_str()) + .await + .map_err(|err| io_other(format!("failed to connect anvil provider: {err}")))?; + provider + .anvil_drop_all_transactions() + .await + .map_err(|err| io_other(format!("failed to drop all pending txs: {err}")))?; + Ok(()) + } } fn read_deployment_address(path: &Path, contract_name: &str) -> HarnessResult
{ diff --git a/tests/harness/src/sequencer.rs b/tests/harness/src/sequencer.rs index e69a69b..f1e5f09 100644 --- a/tests/harness/src/sequencer.rs +++ b/tests/harness/src/sequencer.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 (see LICENSE) use std::fs::{self, OpenOptions}; +use std::io; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; @@ -34,6 +35,46 @@ pub struct ManagedSequencerConfig { pub logs_dir: PathBuf, } +/// Snapshot of the `batches` table. Returned by +/// [`ManagedSequencer::count_batches`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct BatchCounts { + pub total: u64, + pub sealed: u64, + pub invalidated: u64, +} + +/// Outcome of a single [`ManagedSequencer::respawn_and_watch`] attempt. +#[derive(Debug)] +pub enum RespawnAttemptOutcome { + /// The child came up and stayed alive for the requested stabilization + /// window. + Stable, + /// `respawn()` itself returned `Err` — the child exited during bootstrap + /// before HTTP became ready. Typically surfaces + /// `RecoveryError::Refuse(...)` from the startup decision table. + RespawnFailed(String), + /// `respawn()` returned `Ok` but the child exited within the + /// stabilization window. Typically surfaces a danger-detector worker + /// exit from the first post-boot poll. + ExitedPostRespawn(std::process::ExitStatus), +} + +impl RespawnAttemptOutcome { + pub fn is_stable(&self) -> bool { + matches!(self, Self::Stable) + } +} + +/// Parameters for [`ManagedSequencer::respawn_until_stable`]. See that +/// method's doc for how `advance_per_retry` interacts with the restart cycle. +#[derive(Debug, Clone)] +pub struct RespawnPolicy { + pub max_attempts: u32, + pub stabilization: Duration, + pub advance_per_retry: Option, +} + pub struct ManagedSequencer { rollups: DevnetRollupsStack, child: Child, @@ -45,6 +86,26 @@ pub struct ManagedSequencer { data_dir_path: PathBuf, endpoint: String, log_path: PathBuf, + /// Overrides the `--eth-rpc-url` the sequencer uses. When `None`, the + /// sequencer dials Anvil directly. When `Some(url)`, it dials the + /// override (e.g., a `TcpProxy` in front of Anvil for outage tests). + /// Persists across `respawn()` so post-restart behavior is consistent. + l1_endpoint_override: Option, + /// Overrides the `--chain-id` argument passed to the sequencer binary. + /// When `None`, defaults to `DEVNET_CHAIN_ID` (matches Anvil). Set to + /// a non-matching value to test chain-id-mismatch failure modes + chain_id_override: Option, + /// Path to the file libfaketime re-reads for its offset, on every time + /// call (combined with `FAKETIME_NO_CACHE=1`). Writing to this file + /// shifts the sequencer's view of `SystemTime::now()` / `Instant::now()` + /// immediately — no respawn needed. + faketime_rc_path: PathBuf, + /// Cached libfaketime dylib/so path (computed once on spawn). + libfaketime_path: PathBuf, + /// Internal cumulative forward-offset tracker for + /// [`Self::advance_wall_and_mine`]. Not touched by + /// [`Self::set_faketime_offset`]. + cumulative_offset_secs: u64, } pub fn default_devnet_sequencer_config(log_prefix: impl Into) -> ManagedSequencerConfig { @@ -66,6 +127,16 @@ impl ManagedSequencer { let data_dir = TempDir::new() .map_err(|err| io_other(format!("failed to create temp data dir: {err}")))?; let data_dir_path = data_dir.path().to_path_buf(); + + // Set up faketime: locate libfaketime + create the rc file. Initial + // content `+0` means no offset; tests can overwrite with a new offset + // at any time and the running sequencer will see it on its next + // `SystemTime::now()` / `Instant::now()` call (FAKETIME_NO_CACHE=1). + let libfaketime_path = find_libfaketime()?; + let faketime_rc_path = data_dir_path.join("faketime.rc"); + fs::write(faketime_rc_path.as_path(), "+0\n") + .map_err(|err| io_other(format!("create faketime rc file: {err}")))?; + let SpawnedSequencerProcess { child, endpoint, @@ -76,6 +147,10 @@ impl ManagedSequencer { logs_dir.as_path(), data_dir_path.as_path(), &rollups, + None, + None, + libfaketime_path.as_path(), + faketime_rc_path.as_path(), ) .await?; @@ -90,9 +165,271 @@ impl ManagedSequencer { data_dir_path, endpoint, log_path, + l1_endpoint_override: None, + chain_id_override: None, + faketime_rc_path, + libfaketime_path, + cumulative_offset_secs: 0, }) } + /// Configure the sequencer to dial `l1_endpoint` instead of Anvil directly. + /// The override applies to the *next* `respawn()` and persists until cleared. + /// Intended for tests that route through a [`crate::TcpProxy`]. + /// + /// Does not affect the currently-running sequencer process. + pub fn set_l1_endpoint_override(&mut self, l1_endpoint: Option) { + self.l1_endpoint_override = l1_endpoint; + } + + /// Override the `--chain-id` argument the sequencer is spawned with on + /// the next [`Self::respawn`]. When `None`, defaults to the devnet + /// chain id (matches Anvil). + /// + /// Used by chain-id mismatch tests to inject a mismatched chain id and assert + /// that bootstrap refuses before silently pinning a wrong deployment + /// identity. Does not affect + /// the currently-running sequencer process. + pub fn set_chain_id_override(&mut self, chain_id: Option) { + self.chain_id_override = chain_id; + } + + /// Write a faketime offset to the rc file. Effective **immediately** for + /// the running sequencer (if any) and persists across respawns. The + /// libfaketime library re-reads the file on every time call (we pass + /// `FAKETIME_NO_CACHE=1`), so the next `SystemTime::now()` inside the + /// child sees the new offset. + /// + /// Format follows faketime's `-f` flag: `"+5h"`, `"-1h"`, `"+1d"`, or + /// `"+NNNs"` for absolute seconds. Passing `None` resets to `+0`. + /// See `man faketime` for advanced options (speed-up, interval mode). + /// + /// Does not mine L1 blocks — use [`Self::advance_wall_and_mine`] when you + /// want wall-clock and L1 to move together. + /// + /// Replaces any cumulative advance tracked by + /// [`Self::advance_wall_and_mine`], and resets its counter. + pub fn set_faketime_offset(&mut self, offset: Option) -> HarnessResult<()> { + let s = offset.as_deref().unwrap_or("+0"); + fs::write(self.faketime_rc_path.as_path(), format!("{s}\n")) + .map_err(|err| io_other(format!("write faketime rc file: {err}")))?; + self.cumulative_offset_secs = 0; + Ok(()) + } + + /// Delete the sequencer DB file (and its `-wal` / `-shm` siblings), + /// simulating a brand-new install — no pinned identity, no batches, no + /// safe-input rows. Call while the sequencer is stopped. + /// + /// Distinct from "clear only the identity row": that would leave the DB + /// holding `batches` / `safe_inputs` / `l1_safe_head` rows from prior + /// boots, which `IdentityError::OrphanedState` then refuses on the next + /// boot. The right "first boot" simulation is to start from an empty DB. + /// + /// Used by the no-cache-bootstrap and live-RPC chain-id-mismatch tests: + /// both want to exercise the no-cached-identity bootstrap path. + pub fn reset_database(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + for suffix in ["", "-wal", "-shm"] { + let path = db_path.with_extension(format!("db{suffix}")); + match fs::remove_file(path.as_path()) { + Ok(()) => {} + Err(err) if err.kind() == io::ErrorKind::NotFound => {} + Err(err) => return Err(io_other(format!("reset DB ({path:?}): {err}")).into()), + } + } + Ok(()) + } + + /// Rewrite the L1 safe-head observation to "unknown", simulating a DB + /// that has never successfully synced from L1. Call while the sequencer + /// is stopped. + /// + /// Used by the wall-clock-never-synced test: the danger check treats a + /// missing safe-head row as an unusable L1 view and refuses to + /// proceed. Deleting this row while the deployment identity is + /// populated lets us hit that branch without losing the pinned chain + /// ID / InputBox address (which would fail earlier in bootstrap, not + /// in the wall-clock fallback). + pub fn clear_l1_safe_head_observation(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open(db_path.as_path()) + .map_err(|err| io_other(format!("open DB: {err}")))?; + conn.execute("DELETE FROM l1_safe_head WHERE singleton_id = 0", []) + .map_err(|err| io_other(format!("reset L1 safe-head observation: {err}")))?; + Ok(()) + } + + /// Read-only snapshot of the `safe_accepted_batches` view: rows + /// recovered from the L1-side scheduler frontier (i.e., batches the + /// sequencer has *observed accepted on chain*). Returns `(count, + /// min_nonce)` — count is the row count, min_nonce is `MIN(nonce)` or + /// `None` if empty. + /// + /// Used by the nonce-0 recovery test to confirm a recovery batch (which reuses nonce 0) + /// actually lands and gets accepted on L1 — proving the + /// `populate_safe_accepted_batches_inner` cursor handles + /// reused-nonce-after-cascade correctly. + pub fn count_safe_accepted_batches(&self) -> HarnessResult<(u64, Option)> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("count safe_accepted_batches: {err}")))?; + let min_nonce: Option = conn + .query_row("SELECT MIN(nonce) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("min nonce: {err}")))?; + Ok((count as u64, min_nonce.map(|n| n as u64))) + } + + /// Snapshot of the `batches` table: `(total, sealed, invalidated)`. + /// Reads the DB file read-only; safe to call while the sequencer is + /// running. Useful for asserting that batch closure happened during a + /// test segment (e.g., the sequencer kept processing through an outage). + pub fn count_batches(&self) -> HarnessResult { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + let total: i64 = conn + .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) + .map_err(|err| io_other(format!("count batches: {err}")))?; + let sealed: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE sealed_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .map_err(|err| io_other(format!("count sealed batches: {err}")))?; + let invalidated: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE invalidated_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .map_err(|err| io_other(format!("count invalidated batches: {err}")))?; + + Ok(BatchCounts { + total: total as u64, + sealed: sealed as u64, + invalidated: invalidated as u64, + }) + } + + /// Assert the schema-level tree invariants on the sequencer's DB. Runs + /// against the DB file read-only; safe to call whether the sequencer is + /// running or stopped (SQLite WAL + read-only flag handles concurrent + /// writers). + /// + /// Invariants checked: + /// 1. At most one `valid_open_batch` row (partial unique index + /// `ux_single_valid_tip` should guarantee this structurally — + /// we verify it in case the index ever regressed). + /// 2. Every valid batch's `nonce` equals `parent.nonce + 1`, or 0 if + /// `parent_batch_index IS NULL`. + /// 3. Every `parent_batch_index` is NULL or references an existing + /// batch (FK-backed, verified explicitly for cross-DB-tool + /// portability). + /// 4. The nonces on the valid path form a contiguous `0..N` sequence. + /// + /// Panics with a specific violation message if any invariant fails. + /// Harness-only check (no sequencer changes) that catches regressions + /// which slip past user-visible e2e assertions. + pub fn assert_schema_invariants(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + // 1. At most one valid open batch. + let open_count: i64 = conn + .query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("count valid_open_batch: {err}")))?; + if open_count > 1 { + panic!("schema invariant: more than one valid Tip ({open_count} rows)"); + } + + // 2. Nonce contiguity via parent. + let mut stmt = conn + .prepare( + "SELECT b.batch_index, b.parent_batch_index, b.nonce, p.nonce \ + FROM batches b LEFT JOIN batches p ON p.batch_index = b.parent_batch_index", + ) + .map_err(|err| io_other(format!("prepare nonce-check: {err}")))?; + let rows: Vec<(i64, Option, i64, Option)> = stmt + .query_map([], |row| { + Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) + }) + .map_err(|err| io_other(format!("query nonce-check: {err}")))? + .collect::>() + .map_err(|err| io_other(format!("collect nonce-check: {err}")))?; + for (bi, parent, nonce, parent_nonce) in &rows { + match (parent, parent_nonce) { + (None, _) => { + if *nonce != 0 { + panic!( + "schema invariant: batch {bi} has NULL parent but nonce {nonce} (expected 0)" + ); + } + } + (Some(p), None) => { + panic!( + "schema invariant: batch {bi}'s parent {p} doesn't exist (FK violation)" + ); + } + (Some(_), Some(pn)) => { + if *nonce != pn + 1 { + panic!( + "schema invariant: batch {bi} nonce={nonce}, expected parent.nonce+1 = {}", + pn + 1 + ); + } + } + } + } + + // 3. Valid-path nonce uniqueness and contiguity. + let mut stmt = conn + .prepare("SELECT nonce FROM valid_batches ORDER BY nonce ASC") + .map_err(|err| io_other(format!("prepare valid-nonces: {err}")))?; + let valid_nonces: Vec = stmt + .query_map([], |row| row.get::<_, i64>(0)) + .map_err(|err| io_other(format!("query valid-nonces: {err}")))? + .collect::>() + .map_err(|err| io_other(format!("collect valid-nonces: {err}")))?; + for pair in valid_nonces.windows(2) { + if pair[0] == pair[1] { + panic!( + "schema invariant: duplicate valid nonce {} in {valid_nonces:?}", + pair[0] + ); + } + } + for (i, &n) in valid_nonces.iter().enumerate() { + if n != i as i64 { + panic!("schema invariant: valid nonces not contiguous: {valid_nonces:?}"); + } + } + + Ok(()) + } + pub fn endpoint(&self) -> &str { self.endpoint.as_str() } @@ -141,8 +478,198 @@ impl ManagedSequencer { self.rollups.mine_l1_blocks(block_count).await } - pub async fn restart(&mut self) -> HarnessResult<()> { - self.shutdown_child().await?; + /// Toggle Anvil's auto-mining mode. When disabled, txs accumulate in + /// the mempool until an explicit mine or re-enable. Used to hold a + /// sequencer's batch-submission tx out of a block while the chain + /// advances, reproducing the "delayed inclusion" fault that the + /// scheduler handles by skipping past-stale batches. + pub async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + self.rollups.set_automine(enabled).await + } + + /// Drop every pending tx from Anvil's mempool. Typical use: after the + /// sequencer has submitted a batch-submission tx, drop it to simulate + /// a gateway losing the payload. Combined with `mine_l1_blocks` to + /// advance the chain without the dropped tx landing, this reproduces + /// the "tx never mined" variant of delayed-inclusion. + pub async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + self.rollups.drop_all_pending_txs().await + } + + /// Advance both the sequencer's wall clock and the L1 chain by `duration`, + /// maintaining the block-time coupling invariant (`seconds_per_block`, + /// default 12 for Ethereum mainnet parity). + /// + /// This is the primary tool for simulating elapsed outage time. Effective + /// **immediately** — works whether the sequencer is running or stopped: + /// - The faketime rc file is updated; the running sequencer's next time + /// call (or a post-respawn first call) sees the shifted clock. + /// - Anvil mines `duration.as_secs() / SECONDS_PER_BLOCK` blocks. + /// + /// **Cumulative**: calling with `1h` twice totals `+2h`, not `+1h`. Use + /// [`Self::set_faketime_offset`] to jump to a specific offset or reset. + /// + /// Tests that need decoupled wall-clock vs L1 (e.g., the `saturating_sub` + /// backward-jump test) should use [`Self::set_faketime_offset`] and + /// [`Self::mine_l1_blocks`] directly. + /// + /// Assumes `SEQ_SECONDS_PER_BLOCK = 12`. If a test changes that via env, + /// this helper's block count will be wrong — prefer the direct dials in + /// that case. + pub async fn advance_wall_and_mine(&mut self, duration: Duration) -> HarnessResult<()> { + const SECONDS_PER_BLOCK: u64 = 12; + let secs = duration.as_secs(); + let blocks = secs / SECONDS_PER_BLOCK; + self.mine_l1_blocks(blocks).await?; + self.cumulative_offset_secs = self.cumulative_offset_secs.saturating_add(secs); + fs::write( + self.faketime_rc_path.as_path(), + format!("+{}s\n", self.cumulative_offset_secs), + ) + .map_err(|err| io_other(format!("write faketime rc file: {err}")))?; + Ok(()) + } + + /// Watch the sequencer child for `grace` time without consuming its + /// exit handle. + /// + /// - Returns `Ok(None)` if the child is still alive when `grace` + /// elapses. The internal `wait()` future is dropped, so subsequent + /// calls to [`Self::wait_for_exit`] / [`Self::respawn_and_watch`] + /// still work. + /// - Returns `Ok(Some(status))` if the child exited inside the + /// window. The exit status is captured and the child is reaped; + /// the caller shouldn't call `wait_for_exit` afterwards (it would + /// hang). + /// + /// Used by negative-control tests that need to assert the sequencer + /// *stayed up* across a condition that, if a bug existed, would make + /// it exit. + pub async fn observe_for( + &mut self, + grace: Duration, + ) -> HarnessResult> { + tokio::select! { + wait_result = self.child.wait() => { + let status = wait_result + .map_err(|err| io_other(format!("child.wait(): {err}")))?; + Ok(Some(status)) + } + _ = tokio::time::sleep(grace) => Ok(None), + } + } + + /// Wait for the sequencer process to exit on its own. Returns the + /// process's exit status. Times out after `timeout` to avoid hanging + /// tests when the process refuses to exit. + /// + /// Used by tests that expect the sequencer to detect a condition + /// (e.g., wall-clock danger) and self-exit with a non-zero status. + /// After this returns, call [`Self::respawn`] to start a fresh process. + pub async fn wait_for_exit( + &mut self, + timeout: Duration, + ) -> HarnessResult { + let status = tokio::time::timeout(timeout, self.child.wait()) + .await + .map_err(|_| { + io_other(format!( + "wait_for_exit: sequencer did not exit within {timeout:?}" + )) + })? + .map_err(|err| io_other(format!("wait_for_exit: {err}")))?; + Ok(status) + } + + /// Respawn the sequencer and watch the child for `stabilization` to + /// confirm it stays alive. Classifies the outcome so tests can model an + /// orchestrator restart cycle without re-deriving the failure modes. + /// + /// There are two distinct "unstable" shapes the sequencer can take: + /// - The child dies during bootstrap (before HTTP readiness), which + /// makes `respawn()` itself return `Err`. Canonical cause: + /// `RecoveryError::Refuse(...)` from the startup decision table + /// when L1 is unreachable and the persisted state looks stalled. + /// - The child comes up (HTTP ready, bootstrap passed), then one of + /// the internal tasks returns a fatal error and the process exits. + /// Canonical cause: danger-detector worker exit when the first + /// post-boot poll sees a batch past `danger_threshold`. + /// + /// The race between bootstrap-finishes and submitter-first-tick is + /// short (the poll interval is 5s by default, but the first tick runs + /// immediately), so both cases can surface for a single logical event — + /// tests should generally treat either as "not stable" and retry. + /// + /// Callers must ensure the previous child is already reaped (via + /// [`Self::stop`] or [`Self::wait_for_exit`]) — same rule as + /// [`Self::respawn`]. + pub async fn respawn_and_watch( + &mut self, + stabilization: Duration, + ) -> HarnessResult { + if let Err(err) = self.respawn().await { + return Ok(RespawnAttemptOutcome::RespawnFailed(err.to_string())); + } + tokio::select! { + wait_result = self.child.wait() => { + let status = wait_result + .map_err(|err| io_other(format!("child.wait(): {err}")))?; + Ok(RespawnAttemptOutcome::ExitedPostRespawn(status)) + } + _ = tokio::time::sleep(stabilization) => { + Ok(RespawnAttemptOutcome::Stable) + } + } + } + + /// Loop [`Self::respawn_and_watch`] until the sequencer stays up for + /// `policy.stabilization`, or `policy.max_attempts` is reached. Returns + /// the full sequence of attempts. + /// + /// The restart-loop convergence story: an aged Tip in the danger zone + /// (not yet past-stale) auto-closes on respawn, and the resulting closed + /// batch is in the danger zone, so the detector exits with `DangerDetected`. + /// Startup recovery's cascade fires at `MAX_WAIT_BLOCKS`, not at the + /// danger threshold — so the loop only converges once enough *additional* + /// L1 blocks have aged the batch past `MAX_WAIT_BLOCKS`. In production + /// the orchestrator restart itself takes seconds, during which real L1 + /// blocks are produced; `advance_per_retry` simulates that drift. Tests + /// that expect a short hiccup to self-heal (no danger involved) should + /// leave `advance_per_retry` unset. + /// + /// The loop always returns Ok — assert on the final attempt's outcome + /// to decide pass/fail in the test body. + pub async fn respawn_until_stable( + &mut self, + policy: RespawnPolicy, + ) -> HarnessResult> { + let mut outcomes = Vec::with_capacity(policy.max_attempts as usize); + for attempt in 0..policy.max_attempts { + let outcome = self.respawn_and_watch(policy.stabilization).await?; + let stable = outcome.is_stable(); + outcomes.push(outcome); + if stable { + break; + } + let is_last = attempt + 1 == policy.max_attempts; + if let Some(advance) = policy.advance_per_retry.filter(|_| !is_last) { + self.advance_wall_and_mine(advance).await?; + } + } + Ok(outcomes) + } + + /// Kill the sequencer process. Anvil stays running, so `mine_l1_blocks()` still works. + pub async fn stop(&mut self) -> HarnessResult<()> { + self.shutdown_child().await + } + + /// Respawn the sequencer process using the same data directory and Anvil instance. + /// + /// Honors any `l1_endpoint_override` set via [`Self::set_l1_endpoint_override`] + /// and the faketime offset in the rc file (see [`Self::set_faketime_offset`] / + /// [`Self::advance_wall_and_mine`]). + pub async fn respawn(&mut self) -> HarnessResult<()> { let SpawnedSequencerProcess { child, endpoint, @@ -153,6 +680,10 @@ impl ManagedSequencer { self.logs_dir.as_path(), self.data_dir_path.as_path(), &self.rollups, + self.l1_endpoint_override.as_deref(), + self.chain_id_override, + self.libfaketime_path.as_path(), + self.faketime_rc_path.as_path(), ) .await?; self.child = child; @@ -161,6 +692,16 @@ impl ManagedSequencer { Ok(()) } + pub async fn restart(&mut self) -> HarnessResult<()> { + self.stop().await?; + self.respawn().await + } + + /// Read the current sequencer log file contents. + pub fn read_log_contents(&self) -> HarnessResult { + std::fs::read_to_string(&self.log_path).map_err(Into::into) + } + pub async fn ws(&self, from_offset: u64) -> HarnessResult { let client = self.sequencer_client()?; WsClient::connect(&client, from_offset).await @@ -222,12 +763,17 @@ struct SpawnedSequencerProcess { log_path: PathBuf, } +#[allow(clippy::too_many_arguments)] async fn spawn_sequencer_process( sequencer_bin: &Path, log_prefix: &str, logs_dir: &Path, data_dir: &Path, rollups: &DevnetRollupsStack, + l1_endpoint_override: Option<&str>, + chain_id_override: Option, + libfaketime_path: &Path, + faketime_rc_path: &Path, ) -> HarnessResult { let (endpoint, http_addr) = build_local_endpoint()?; let log_path = timestamped_log_path(logs_dir, log_prefix); @@ -241,15 +787,27 @@ async fn spawn_sequencer_process( let batch_submitter_key = default_private_keys().first().cloned().unwrap_or_else(|| { "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80".to_string() }); - let mut child = Command::new(path_as_str(sequencer_bin)?) + let eth_rpc_url = l1_endpoint_override.unwrap_or_else(|| rollups.l1_endpoint()); + + // Set up libfaketime via env vars (not the `faketime` wrapper binary). + // The wrapper sets the FAKETIME env var, which has priority over + // FAKETIME_TIMESTAMP_FILE — bypassing it lets the file-based mechanism + // work. The file's contents are re-read on every `SystemTime::now()` / + // `Instant::now()` call thanks to FAKETIME_NO_CACHE=1, so tests can + // shift the clock dynamically during a run. + let mut cmd = Command::new(path_as_str(sequencer_bin)?); + apply_faketime_env(&mut cmd, libfaketime_path, faketime_rc_path)?; + + let chain_id = chain_id_override.unwrap_or(DEVNET_CHAIN_ID); + let mut child = cmd .arg("--http-addr") .arg(http_addr) .arg("--data-dir") .arg(path_as_str(data_dir)?) .arg("--eth-rpc-url") - .arg(rollups.l1_endpoint()) + .arg(eth_rpc_url) .arg("--chain-id") - .arg(DEVNET_CHAIN_ID.to_string()) + .arg(chain_id.to_string()) .arg("--app-address") .arg(rollups.app_address().to_string()) .arg("--batch-submitter-private-key") @@ -278,3 +836,161 @@ async fn spawn_sequencer_process( log_path, }) } + +/// Configure the child process env to preload libfaketime and point it at +/// the rc file for dynamic offsets. macOS uses `DYLD_INSERT_LIBRARIES` + +/// `DYLD_FORCE_FLAT_NAMESPACE=1`; Linux uses `LD_PRELOAD`. +fn apply_faketime_env( + cmd: &mut Command, + libfaketime_path: &Path, + faketime_rc_path: &Path, +) -> HarnessResult<()> { + let lib = path_as_str(libfaketime_path)?; + let rc = path_as_str(faketime_rc_path)?; + if cfg!(target_os = "macos") { + cmd.env("DYLD_INSERT_LIBRARIES", lib) + .env("DYLD_FORCE_FLAT_NAMESPACE", "1"); + } else { + cmd.env("LD_PRELOAD", lib); + } + cmd.env("FAKETIME_TIMESTAMP_FILE", rc) + .env("FAKETIME_NO_CACHE", "1"); + Ok(()) +} + +/// Locate the libfaketime shared library. Searches: +/// 1. `$LIBFAKETIME_LIB` (explicit override). +/// 2. `lib/faketime/libfaketime.{1.dylib,so.1}` relative to the `faketime` +/// binary's prefix (Nix layout). +/// 3. Linux distro multiarch lib dirs such as +/// `/usr/lib/x86_64-linux-gnu/faketime` (Debian/Ubuntu apt layout). +fn find_libfaketime() -> HarnessResult { + if let Ok(p) = std::env::var("LIBFAKETIME_LIB") { + let p = PathBuf::from(p); + if p.exists() { + return Ok(p); + } + return Err(io_other(format!("LIBFAKETIME_LIB={p:?} does not exist")).into()); + } + + let path = + std::env::var("PATH").map_err(|err| io_other(format!("PATH env var unreadable: {err}")))?; + let faketime_bin = std::env::split_paths(&path) + .map(|p| p.join("faketime")) + .find(|p| p.exists()) + .ok_or_else(|| { + io_other("`faketime` binary not found in PATH; add libfaketime to the dev shell") + })?; + + let prefix = faketime_bin + .parent() + .and_then(|p| p.parent()) + .ok_or_else(|| { + io_other(format!( + "faketime path has no grandparent: {faketime_bin:?}" + )) + })?; + let lib_dirs = candidate_libfaketime_dirs(prefix); + let candidates = libfaketime_file_names(); + if let Some(path) = find_libfaketime_in_dirs(lib_dirs.as_slice(), candidates) { + return Ok(path); + } + + let searched = lib_dirs + .iter() + .map(|p| format!("{p:?}")) + .collect::>() + .join(", "); + Err(io_other(format!( + "libfaketime not found under any searched directory [{searched}] (tried {candidates:?})" + )) + .into()) +} + +fn libfaketime_file_names() -> &'static [&'static str] { + if cfg!(target_os = "macos") { + &["libfaketime.1.dylib", "libfaketime.dylib"] + } else { + &["libfaketime.so.1", "libfaketime.so"] + } +} + +fn candidate_libfaketime_dirs(prefix: &Path) -> Vec { + let mut dirs = Vec::new(); + let lib_dir = prefix.join("lib"); + dirs.push(lib_dir.join("faketime")); + + if cfg!(target_os = "linux") { + if let Ok(entries) = fs::read_dir(&lib_dir) { + let mut multiarch_dirs = entries + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| path.is_dir()) + .filter(|path| path.file_name().is_some_and(|name| name != "faketime")) + .map(|path| path.join("faketime")) + .collect::>(); + multiarch_dirs.sort(); + dirs.extend(multiarch_dirs); + } + dirs.push(prefix.join("lib64").join("faketime")); + } + + dirs.dedup(); + dirs +} + +fn find_libfaketime_in_dirs(lib_dirs: &[PathBuf], candidates: &[&str]) -> Option { + for lib_dir in lib_dirs { + for name in candidates { + let path = lib_dir.join(name); + if path.exists() { + return Some(path); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::{candidate_libfaketime_dirs, find_libfaketime_in_dirs}; + + #[cfg(target_os = "linux")] + #[test] + fn libfaketime_lookup_finds_debian_multiarch_layout() { + let temp = tempfile::TempDir::new().expect("tempdir"); + let prefix = temp.path(); + let multiarch_dir = prefix.join("lib").join("x86_64-linux-gnu").join("faketime"); + fs::create_dir_all(&multiarch_dir).expect("create multiarch faketime dir"); + let expected = multiarch_dir.join("libfaketime.so.1"); + fs::write(&expected, b"fake so").expect("write fake lib"); + + let dirs = candidate_libfaketime_dirs(prefix); + let found = find_libfaketime_in_dirs(dirs.as_slice(), &["libfaketime.so.1"]) + .expect("multiarch lib should be discovered"); + + assert_eq!(found, expected); + } + + #[test] + fn libfaketime_lookup_prefers_direct_prefix_layout() { + let temp = tempfile::TempDir::new().expect("tempdir"); + let prefix = temp.path(); + let direct_dir = prefix.join("lib").join("faketime"); + let multiarch_dir = prefix.join("lib").join("x86_64-linux-gnu").join("faketime"); + fs::create_dir_all(&direct_dir).expect("create direct faketime dir"); + fs::create_dir_all(&multiarch_dir).expect("create multiarch faketime dir"); + let expected = direct_dir.join("libfaketime.so.1"); + let fallback = multiarch_dir.join("libfaketime.so.1"); + fs::write(&expected, b"direct").expect("write direct lib"); + fs::write(&fallback, b"fallback").expect("write fallback lib"); + + let dirs = candidate_libfaketime_dirs(prefix); + let found = find_libfaketime_in_dirs(dirs.as_slice(), &["libfaketime.so.1"]) + .expect("direct lib should be discovered"); + + assert_eq!(found, expected); + } +} diff --git a/tests/harness/src/wallet.rs b/tests/harness/src/wallet.rs index 14768ff..f713c58 100644 --- a/tests/harness/src/wallet.rs +++ b/tests/harness/src/wallet.rs @@ -234,13 +234,7 @@ impl WalletL2Client { endpoint.to_string(), DEFAULT_SEQUENCER_CLIENT_TIMEOUT, )?; - let domain = Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(chain_id)), - verifying_contract: Some(verifying_contract), - salt: None, - }; + let domain = sequencer_core::build_input_domain(chain_id, verifying_contract); Ok(Self { signer, client,