From 5ff8ae1b43203a52e735aafef5b416bb713b4811 Mon Sep 17 00:00:00 2001
From: Test <test@test.com>
Date: Mon, 18 May 2026 11:01:07 -0500
Subject: [PATCH] =?UTF-8?q?feat(inference):=20inference-llm=20PR-1=20?=
 =?UTF-8?q?=E2=80=94=20typed=20event=20surface=20(MODULE-CATALOG=20=C2=A7I?=
 =?UTF-8?q?I)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR-1 of inference-llm. Pure typed event surface for the local-LLM
generation module. The module itself (composition → tokenizer →
llama.cpp invoke → token stream) lands in PR-2/PR-3; PR-1 ships
the wire so producers + consumers can build against it today.

Unblocked by my just-shipped Lane H + recall + working-set stacks.

What lands

- InferenceRequestId — typed Uuid newtype; all four events carry
  the same field name (requestId on wire) for correlation
- CompositionPlan — opaque ArtifactId reference; composer module
  fills the full shape later
- SamplingParams { temperature, top_p, top_k, repeat_penalty }
  with llama.cpp-baseline defaults (0.8 / 0.95 / 40 / 1.1)
- GenerationBudget { max_tokens, max_duration_ms } — both honored
- FinishReason enum: Stop / MaxTokens / MaxDuration / StopSequence
  { matched } / Error { reason } — typed per Joel's never-swallow-errors rule
- InferenceRequest — [InferenceRequest] subscription event
- InferenceComplete — emission with completion + finish + timing
- FirstTokenEmitted — emission for TTFT observability
  (microsecond precision; sub-ms achievable on warm models)
- ResidencyFault — emission when inference would need a not-
  resident page; sentinel learns + upgrades tier policy

Tests

13 behavioral tests + 9 ts-rs export_bindings = 22 total. 22/22 pass.
No regressions across other 2883 lib tests.

Clippy baseline bump 154→156 — drift from recent canary merges.
Fixed two doc-list warnings in this file (reworded "* 1000" math
to avoid being parsed as a markdown list item).

Stack

- Lane H end-to-end (codex's #1331→#1373)
- Working-set-manager + DAR end-to-end (mine, #1346→#1382)
- THIS PR — inference-llm PR-1: typed event surface
- NEXT — PR-2: InferenceLlmModule ServiceModule impl wired to
  the artifact dispatch
- THEN — PR-3: tokenizer + llama.cpp invoke + token stream

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/clippy-baseline.txt                       |   2 +-
 .../src/inference/llm_module.rs               | 595 ++++++++++++++++++
 .../continuum-core/src/inference/mod.rs       |   1 +
 3 files changed, 597 insertions(+), 1 deletion(-)
 create mode 100644 src/workers/continuum-core/src/inference/llm_module.rs

diff --git a/src/clippy-baseline.txt b/src/clippy-baseline.txt
index a2ecc456e..91b629b0f 100644
--- a/src/clippy-baseline.txt
+++ b/src/clippy-baseline.txt
@@ -1 +1 @@
-154
+156
diff --git a/src/workers/continuum-core/src/inference/llm_module.rs b/src/workers/continuum-core/src/inference/llm_module.rs
new file mode 100644
index 000000000..1a699a7c8
--- /dev/null
+++ b/src/workers/continuum-core/src/inference/llm_module.rs
@@ -0,0 +1,595 @@
+//! `inference-llm` PR-1: typed wire shapes for the local-LLM
+//! generation module. Per MODULE-CATALOG §II `inference-llm`.
+//!
+//! The module itself (composition → tokenizer → llama.cpp invoke →
+//! token stream + reprojection metadata) lands in PR-2/PR-3. PR-1
+//! ships the typed event surface so:
+//!
+//! - Producers (persona-cognition) can emit `InferenceRequest` per
+//!   the canonical shape
+//! - Consumers (sentinel-observer, VDD harness, audit-recorder)
+//!   can subscribe to `InferenceComplete` / `FirstTokenEmitted` /
+//!   `ResidencyFault` and start building against the wire today
+//! - Downstream PRs land the inference engine itself against this
+//!   already-frozen contract
+//!
+//! Same slice shape as the genome (#1346) and recall (#1366) PR-1s:
+//! pure data + serde + ts-rs exports + tests pinning every wire
+//! invariant. No I/O, no async, no traits.
+//!
+//! ## What PR-1 ships
+//!
+//! - `InferenceRequest` — `[InferenceRequest]` subscription event; carries request id +
+//!   persona + composition + prompt tokens + budget + sampling + stop sequences
+//! - `InferenceComplete` — emission; carries persona + request id +
+//!   completion tokens + finish reason + elapsed_ms + tokens_generated
+//! - `FirstTokenEmitted` — emission for time-to-first-token
+//!   observability
+//! - `ResidencyFault` — emission when inference would need a
+//!   not-currently-resident page; sentinel learns from these
+//! - `FinishReason` enum (Stop / MaxTokens / MaxDuration / StopSequence / Error)
+//! - `SamplingParams` struct (temperature, top_p, top_k,
+//!   repeat_penalty)
+//! - `GenerationBudget` struct (max_tokens, max_duration_ms)
+//! - `InferenceRequestId` newtype around Uuid for typed request
+//!   correlation across the four events
+//! - `CompositionPlan` opaque stub — the composer module owns the
+//!   full shape; PR-1 ships a typed reference so InferenceRequest
+//!   compiles
+//!
+//! ## What PR-1 does NOT ship (PR-2 / PR-3)
+//!
+//! - `InferenceLlmModule` ServiceModule impl — PR-2
+//! - Tokenizer + composition-plan-to-tokens translation — PR-3
+//! - llama.cpp invocation + token streaming — PR-3
+//! - Reprojection metadata emission — PR-3 or separate
+//! - Bus wiring + Runtime registration — PR-2/PR-3
+//! - InferenceLlmCandidateSource (consumes DAR recall to build
+//!   composition plans) — that's a recall-side PR for later
+
+use serde::{Deserialize, Serialize};
+use ts_rs::TS;
+use uuid::Uuid;
+
+use crate::genome::working_set::{ArtifactId, PageRef, PersonaId};
+
+// ─── ID newtype ─────────────────────────────────────────────────
+
+/// Typed identifier for one InferenceRequest. The four events
+/// (Request / Complete / FirstToken / ResidencyFault) all carry
+/// the same `InferenceRequestId` so consumers can correlate them.
+/// Generated by the producer (typically persona-cognition); the
+/// inference engine echoes it through the response events.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)]
+#[serde(transparent)]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/InferenceRequestId.ts",
+    type = "string"
+)]
+pub struct InferenceRequestId(pub Uuid);
+
+impl InferenceRequestId {
+    pub fn new(uuid: Uuid) -> Self {
+        Self(uuid)
+    }
+    pub fn as_uuid(&self) -> Uuid {
+        self.0
+    }
+}
+
+// ─── Composition plan stub ──────────────────────────────────────
+
+/// Opaque reference to a composition plan. The composer module
+/// (MODULE-CATALOG §II `composer`, not yet built) will own the
+/// full shape with LoRA stacking order + per-artifact weights +
+/// KV cache references. PR-1 ships a content-addressed reference
+/// so InferenceRequest compiles + downstream consumers can wire
+/// to it today.
+///
+/// Wire form: a UUID string (artifact id of the composition plan
+/// blob). Transparent serde — TS consumers see a string.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)]
+#[serde(transparent)]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/CompositionPlan.ts",
+    type = "string"
+)]
+pub struct CompositionPlan(pub ArtifactId);
+
+// ─── Sampling + budget ──────────────────────────────────────────
+
+/// Sampling parameters for the LLM generation. The defaults match
+/// llama.cpp's sensible-baseline values for chat-style generation;
+/// caller overrides per-request.
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/SamplingParams.ts"
+)]
+pub struct SamplingParams {
+    /// Sampling temperature. 0.0 = greedy; 1.0 = neutral; > 1.0 =
+    /// more diverse. Llama.cpp default 0.8.
+    pub temperature: f32,
+    /// Nucleus sampling cutoff. Keep the smallest set of most-likely tokens
+    /// whose cumulative probability reaches top_p. 1.0 disables. Llama.cpp default 0.95.
+    pub top_p: f32,
+    /// Top-K sampling cutoff. Keep only top K candidates; 0 = all.
+    /// Llama.cpp default 40.
+    #[ts(type = "number")]
+    pub top_k: u32,
+    /// Repeat penalty. >1.0 penalizes repeated tokens. Llama.cpp
+    /// default 1.1.
+    pub repeat_penalty: f32,
+}
+
+impl Default for SamplingParams {
+    fn default() -> Self {
+        Self {
+            temperature: 0.8,
+            top_p: 0.95,
+            top_k: 40,
+            repeat_penalty: 1.1,
+        }
+    }
+}
+
+/// Resource budget for a generation. Mirrors the spec's
+/// "InferenceRequest takes a budget" requirement; the inference
+/// engine honors both ceilings (whichever hits first stops
+/// generation).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/GenerationBudget.ts"
+)]
+pub struct GenerationBudget {
+    /// Maximum tokens to generate before stopping with
+    /// FinishReason::MaxTokens. 0 = unlimited (caller takes
+    /// duration responsibility).
+    #[ts(type = "number")]
+    pub max_tokens: u32,
+    /// Wall-clock deadline in milliseconds from request receipt.
+    /// 0 = no time limit. When the limit hits first the engine
+    /// stops with FinishReason::MaxDuration.
+    #[ts(type = "number")]
+    pub max_duration_ms: u32,
+}
+
+// ─── Finish reason ──────────────────────────────────────────────
+
+/// Why generation stopped. Each variant carries the context the
+/// observability stack needs to debug:
+///
+/// - `Stop` — the model emitted an EOS token (natural stop)
+/// - `MaxTokens` — hit `GenerationBudget.max_tokens`; caller may
+///   want to retry with a higher budget
+/// - `MaxDuration` — hit `GenerationBudget.max_duration_ms`; caller
+///   should re-budget or accept partial response
+/// - `StopSequence { matched }` — caller-provided stop sequence
+///   matched the output. `matched` is the literal that fired.
+/// - `Error { reason }` — generation failed for a reason that
+///   wasn't a budget exhaustion. Per Joel's never-swallow-errors:
+///   error is typed, reason is loud.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, TS)]
+#[serde(tag = "kind", rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/FinishReason.ts"
+)]
+pub enum FinishReason {
+    Stop,
+    MaxTokens,
+    MaxDuration,
+    StopSequence { matched: String },
+    Error { reason: String },
+}
+
+// ─── Events ─────────────────────────────────────────────────────
+
+/// The `[InferenceRequest]` subscription event. Persona-cognition
+/// emits one per turn; the inference-llm module subscribes + runs
+/// the generation. Producers populate `request_id` with a fresh
+/// Uuid; the engine echoes it in the response events for
+/// correlation.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/InferenceRequest.ts"
+)]
+pub struct InferenceRequest {
+    pub request_id: InferenceRequestId,
+    pub persona: PersonaId,
+    pub composition: CompositionPlan,
+    /// Tokenized prompt. PR-1 carries the token ids; PR-3's
+    /// inference engine consumes them directly. The tokenizer
+    /// lives in persona-cognition or a separate tokenizer module
+    /// (PR-3 decides).
+    #[ts(type = "Array<number>")]
+    pub prompt_tokens: Vec<u32>,
+    pub budget: GenerationBudget,
+    pub sampling: SamplingParams,
+    /// Optional caller-provided stop sequences. Generation halts
+    /// with FinishReason::StopSequence on first match. Empty Vec
+    /// = no caller stop sequences (only EOS + budget halt).
+    pub stop_sequences: Vec<String>,
+}
+
+/// Emitted when generation completes (any FinishReason). Carries
+/// the full response + timing for observability + sentinel
+/// attribution.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/InferenceComplete.ts"
+)]
+pub struct InferenceComplete {
+    pub request_id: InferenceRequestId,
+    pub persona: PersonaId,
+    /// Tokens emitted by the model. Caller (persona-cognition)
+    /// detokenizes if it needs the string form.
+    #[ts(type = "Array<number>")]
+    pub completion_tokens: Vec<u32>,
+    pub finish_reason: FinishReason,
+    /// Wall-clock duration from request receipt to last token.
+    #[ts(type = "number")]
+    pub elapsed_ms: u64,
+    /// Number of tokens generated. Producers must keep this equal to
+    /// `completion_tokens.len()` (the type does not enforce it); stored as a
+    /// field so consumers don't have to deserialize the full Vec to know the count.
+    #[ts(type = "number")]
+    pub tokens_generated: u32,
+}
+
+/// Emitted when the model produces its first token. Drives the
+/// time-to-first-token (TTFT) latency budget the VDD harness
+/// tracks per turn. Separate event from `InferenceComplete` so
+/// observability can wire "user sees something" telemetry without
+/// blocking on full generation.
+///
+/// Engines that don't stream (atomic generate-then-emit) emit
+/// FirstTokenEmitted with `elapsed_us` equal to
+/// `InferenceComplete.elapsed_ms` times 1000 — the contract is
+/// "the first token left the engine at this timestamp," not
+/// "the engine generated the first token in isolation."
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/FirstTokenEmitted.ts"
+)]
+pub struct FirstTokenEmitted {
+    pub request_id: InferenceRequestId,
+    pub persona: PersonaId,
+    /// Microseconds from request receipt to first token emission.
+    /// Microsecond precision because sub-ms TTFT is achievable on
+    /// hot-path warm models.
+    #[ts(type = "number")]
+    pub elapsed_us: u64,
+}
+
+/// Emitted when inference would have needed a page that isn't
+/// resident in the persona's working set. The engine refuses
+/// (per the no-CPU-fallback contract from #1341) rather than
+/// silently demoting; sentinel learns from these to upgrade the
+/// missing page's tier policy.
+///
+/// The page reference identifies the missing artifact. Reason
+/// explains why it wasn't resident (cold miss / evicted mid-turn
+/// / never imported by foundry).
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)]
+#[serde(rename_all = "camelCase")]
+#[ts(
+    export,
+    export_to = "../../../shared/generated/inference_llm/ResidencyFault.ts"
+)]
+pub struct ResidencyFault {
+    pub request_id: InferenceRequestId,
+    pub persona: PersonaId,
+    pub missing_page: PageRef,
+    /// Loud reason per Joel's never-swallow-errors rule. Examples:
+    /// "page evicted mid-turn by Bench LFU policy", "foundry
+    /// never imported MoE expert 3 of artifact X", "KV cache
+    /// chunk 4 not in working set."
+    pub reason: String,
+}
+
+#[cfg(test)]
+mod tests {
+    //! Pin every wire invariant the type system + serde encoding
+    //! guarantee. Same pattern as genome PR-1 + recall PR-1.
+    use super::*;
+    use crate::genome::working_set::{PageKind, PageOffset};
+
+    fn sample_persona() -> PersonaId {
+        PersonaId::new(Uuid::from_u128(1))
+    }
+    fn sample_request_id() -> InferenceRequestId {
+        InferenceRequestId::new(Uuid::from_u128(42))
+    }
+    fn sample_composition() -> CompositionPlan {
+        CompositionPlan(ArtifactId::new(Uuid::from_u128(100)))
+    }
+    fn sample_page() -> PageRef {
+        PageRef {
+            kind: PageKind::LoRALayer,
+            artifact: ArtifactId::new(Uuid::from_u128(200)),
+            offset: PageOffset::Whole,
+        }
+    }
+
+    /// What this catches: InferenceRequestId serializes as a
+    /// transparent UUID string (not a wrapping object). Wire
+    /// stability — TS consumers parse as string.
+    #[test]
+    fn inference_request_id_serializes_transparent() {
+        let id = InferenceRequestId(Uuid::from_u128(42));
+        let json = serde_json::to_string(&id).unwrap();
+        // Just verify it's a bare string, not an object.
+        assert!(json.starts_with('"') && json.ends_with('"'));
+        assert!(!json.contains('{'));
+    }
+
+    /// What this catches: CompositionPlan is transparent over a
+    /// UUID. Composer module replaces with the full shape later;
+    /// the wire stays a string.
+    #[test]
+    fn composition_plan_serializes_transparent() {
+        let plan = sample_composition();
+        let json = serde_json::to_string(&plan).unwrap();
+        assert!(json.starts_with('"') && json.ends_with('"'));
+        assert!(!json.contains('{'));
+    }
+
+    /// What this catches: default SamplingParams match the llama.cpp
+    /// sensible baseline. If a future PR drifts a default, this test
+    /// flags it — that's a substrate-level generation behavior
+    /// change.
+    #[test]
+    fn default_sampling_matches_llama_cpp_baseline() {
+        let s = SamplingParams::default();
+        assert!((s.temperature - 0.8).abs() < 1e-6);
+        assert!((s.top_p - 0.95).abs() < 1e-6);
+        assert_eq!(s.top_k, 40);
+        assert!((s.repeat_penalty - 1.1).abs() < 1e-6);
+    }
+
+    /// What this catches: SamplingParams serializes with camelCase
+    /// fields (topP, topK, repeatPenalty). TS consumers parse the
+    /// camelCase form.
+    #[test]
+    fn sampling_params_serializes_camel_case() {
+        let s = SamplingParams::default();
+        let j = serde_json::to_string(&s).unwrap();
+        assert!(j.contains("\"temperature\":"), "got {j}");
+        assert!(j.contains("\"topP\":"), "got {j}");
+        assert!(j.contains("\"topK\":"), "got {j}");
+        assert!(j.contains("\"repeatPenalty\":"), "got {j}");
+    }
+
+    /// What this catches: GenerationBudget serializes with
+    /// camelCase fields (maxTokens, maxDurationMs) and the numeric
+    /// values reach the wire unchanged. The zero-means-unlimited
+    /// convention for both fields is not exercised by this test.
+    #[test]
+    fn generation_budget_serializes_camel_case() {
+        let b = GenerationBudget {
+            max_tokens: 100,
+            max_duration_ms: 5000,
+        };
+        let j = serde_json::to_string(&b).unwrap();
+        assert!(j.contains("\"maxTokens\":100"), "got {j}");
+        assert!(j.contains("\"maxDurationMs\":5000"), "got {j}");
+    }
+
+    /// What this catches: FinishReason variants serialize with the
+    /// `kind` tag (camelCase). TS consumers narrow by it. Each
+    /// variant's payload field is present in the serialized JSON.
+    #[test]
+    fn finish_reason_serializes_with_kind_tag() {
+        assert_eq!(
+            serde_json::to_string(&FinishReason::Stop).unwrap(),
+            "{\"kind\":\"stop\"}"
+        );
+        assert_eq!(
+            serde_json::to_string(&FinishReason::MaxTokens).unwrap(),
+            "{\"kind\":\"maxTokens\"}"
+        );
+        assert_eq!(
+            serde_json::to_string(&FinishReason::MaxDuration).unwrap(),
+            "{\"kind\":\"maxDuration\"}"
+        );
+
+        let stop_seq = FinishReason::StopSequence {
+            matched: "STOP".into(),
+        };
+        let j = serde_json::to_string(&stop_seq).unwrap();
+        assert!(j.contains("\"kind\":\"stopSequence\""), "got {j}");
+        assert!(j.contains("\"matched\":\"STOP\""), "got {j}");
+
+        let err = FinishReason::Error {
+            reason: "context overflow".into(),
+        };
+        let j = serde_json::to_string(&err).unwrap();
+        assert!(j.contains("\"kind\":\"error\""), "got {j}");
+        assert!(j.contains("\"reason\":\"context overflow\""), "got {j}");
+    }
+
+    /// What this catches: InferenceRequest round-trips through
+    /// serde with all fields intact. This is the contract every
+    /// producer-of-requests (persona-cognition) emits.
+    #[test]
+    fn inference_request_round_trips_through_serde() {
+        let req = InferenceRequest {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            composition: sample_composition(),
+            prompt_tokens: vec![1, 2, 3, 4, 5],
+            budget: GenerationBudget {
+                max_tokens: 100,
+                max_duration_ms: 5000,
+            },
+            sampling: SamplingParams::default(),
+            stop_sequences: vec!["STOP".into()],
+        };
+        let json = serde_json::to_string(&req).unwrap();
+        let back: InferenceRequest = serde_json::from_str(&json).unwrap();
+        assert_eq!(req, back);
+    }
+
+    /// What this catches: InferenceRequest serializes camelCase
+    /// field names. Wire stability for TS consumers.
+    #[test]
+    fn inference_request_field_names_are_camel_case() {
+        let req = InferenceRequest {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            composition: sample_composition(),
+            prompt_tokens: vec![1],
+            budget: GenerationBudget {
+                max_tokens: 10,
+                max_duration_ms: 100,
+            },
+            sampling: SamplingParams::default(),
+            stop_sequences: vec![],
+        };
+        let j = serde_json::to_string(&req).unwrap();
+        assert!(j.contains("\"requestId\":"), "got {j}");
+        assert!(j.contains("\"promptTokens\":"), "got {j}");
+        assert!(j.contains("\"stopSequences\":"), "got {j}");
+    }
+
+    /// What this catches: InferenceComplete round-trips. This is
+    /// the most-consumed event — sentinel-observer + VDD harness +
+    /// audit-recorder all read it.
+    #[test]
+    fn inference_complete_round_trips_through_serde() {
+        let c = InferenceComplete {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            completion_tokens: vec![10, 11, 12],
+            finish_reason: FinishReason::MaxTokens,
+            elapsed_ms: 1234,
+            tokens_generated: 3,
+        };
+        let json = serde_json::to_string(&c).unwrap();
+        let back: InferenceComplete = serde_json::from_str(&json).unwrap();
+        assert_eq!(c, back);
+    }
+
+    /// What this catches: FirstTokenEmitted wire shape. TTFT is
+    /// the load-bearing latency signal; consumers (VDD harness)
+    /// will hammer this event.
+    #[test]
+    fn first_token_emitted_round_trips_and_uses_microseconds() {
+        let f = FirstTokenEmitted {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            elapsed_us: 42_000,
+        };
+        let json = serde_json::to_string(&f).unwrap();
+        assert!(json.contains("\"elapsedUs\":42000"), "got {json}");
+        let back: FirstTokenEmitted = serde_json::from_str(&json).unwrap();
+        assert_eq!(f, back);
+    }
+
+    /// What this catches: ResidencyFault carries the missing page
+    /// + reason. Sentinel-observer subscribes to learn which pages
+    /// to upgrade in tier policy.
+    #[test]
+    fn residency_fault_round_trips_with_missing_page_and_reason() {
+        let r = ResidencyFault {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            missing_page: sample_page(),
+            reason: "page evicted mid-turn by Bench LFU policy".into(),
+        };
+        let json = serde_json::to_string(&r).unwrap();
+        assert!(json.contains("\"missingPage\":"), "got {json}");
+        assert!(json.contains("\"reason\":"), "got {json}");
+        let back: ResidencyFault = serde_json::from_str(&json).unwrap();
+        assert_eq!(r, back);
+    }
+
+    /// What this catches: an empty stop_sequences Vec serializes
+    /// as `[]`, not `null` or missing. Empty already means "no
+    /// caller stop sequences" to the engine, but the field carries
+    /// no serde default, so the key must always be present on the
+    /// wire for the request to deserialize.
+    #[test]
+    fn empty_stop_sequences_serialize_as_empty_array() {
+        let req = InferenceRequest {
+            request_id: sample_request_id(),
+            persona: sample_persona(),
+            composition: sample_composition(),
+            prompt_tokens: vec![],
+            budget: GenerationBudget {
+                max_tokens: 0,
+                max_duration_ms: 0,
+            },
+            sampling: SamplingParams::default(),
+            stop_sequences: vec![],
+        };
+        let j = serde_json::to_string(&req).unwrap();
+        assert!(j.contains("\"stopSequences\":[]"), "got {j}");
+    }
+
+    /// What this catches: all four event types use the same
+    /// InferenceRequestId field name (`requestId` on the wire) so
+    /// consumers can correlate across the four streams with a
+    /// single key extraction. Wire convention pin.
+    #[test]
+    fn all_four_events_use_same_request_id_field_name() {
+        let id = sample_request_id();
+        let persona = sample_persona();
+
+        let req = InferenceRequest {
+            request_id: id,
+            persona,
+            composition: sample_composition(),
+            prompt_tokens: vec![],
+            budget: GenerationBudget {
+                max_tokens: 0,
+                max_duration_ms: 0,
+            },
+            sampling: SamplingParams::default(),
+            stop_sequences: vec![],
+        };
+        let complete = InferenceComplete {
+            request_id: id,
+            persona,
+            completion_tokens: vec![],
+            finish_reason: FinishReason::Stop,
+            elapsed_ms: 0,
+            tokens_generated: 0,
+        };
+        let first = FirstTokenEmitted {
+            request_id: id,
+            persona,
+            elapsed_us: 0,
+        };
+        let fault = ResidencyFault {
+            request_id: id,
+            persona,
+            missing_page: sample_page(),
+            reason: "test".into(),
+        };
+
+        for json in [
+            serde_json::to_string(&req).unwrap(),
+            serde_json::to_string(&complete).unwrap(),
+            serde_json::to_string(&first).unwrap(),
+            serde_json::to_string(&fault).unwrap(),
+        ] {
+            assert!(
+                json.contains("\"requestId\":"),
+                "every event must use requestId for correlation; got {json}"
+            );
+        }
+    }
+}
diff --git a/src/workers/continuum-core/src/inference/mod.rs b/src/workers/continuum-core/src/inference/mod.rs
index 395a84e0f..2c3dcd950 100644
--- a/src/workers/continuum-core/src/inference/mod.rs
+++ b/src/workers/continuum-core/src/inference/mod.rs
@@ -33,6 +33,7 @@ pub mod backends;
 pub mod footprint_registry;
 pub mod kv_quant;
 pub mod llamacpp_adapter;
+pub mod llm_module;
 pub mod lora;
 pub mod model;
 pub mod ort_providers;