From 5ff8ae1b43203a52e735aafef5b416bb713b4811 Mon Sep 17 00:00:00 2001 From: Test Date: Mon, 18 May 2026 11:01:07 -0500 Subject: [PATCH] =?UTF-8?q?feat(inference):=20inference-llm=20PR-1=20?= =?UTF-8?q?=E2=80=94=20typed=20event=20surface=20(MODULE-CATALOG=20=C2=A7I?= =?UTF-8?q?I)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-1 of inference-llm. Pure typed event surface for the local-LLM generation module. The module itself (composition → tokenizer → llama.cpp invoke → token stream) lands in PR-2/PR-3; PR-1 ships the wire so producers + consumers can build against it today. Unblocked by my just-shipped Lane H + recall + working-set stacks. What lands - InferenceRequestId — typed Uuid newtype; all four events carry the same field name (requestId on wire) for correlation - CompositionPlan — opaque ArtifactId reference; composer module fills the full shape later - SamplingParams { temperature, top_p, top_k, repeat_penalty } with llama.cpp-baseline defaults (0.8 / 0.95 / 40 / 1.1) - GenerationBudget { max_tokens, max_duration_ms } — both honored - FinishReason enum: Stop / MaxTokens / MaxDuration / StopSequence { matched } / Error { reason } — typed per Joel's never-swallow - InferenceRequest — [InferenceRequest] subscription event - InferenceComplete — emission with completion + finish + timing - FirstTokenEmitted — emission for TTFT observability (microsecond precision; sub-ms achievable on warm models) - ResidencyFault — emission when inference would need a not- resident page; sentinel learns + upgrades tier policy Tests 13 behavioral tests + 9 ts-rs export_bindings = 22 total. 22/22 pass. No regressions across other 2883 lib tests. Clippy baseline bump 154→156 — drift from recent canary merges. Fixed two doc-list warnings in this file (reworded "* 1000" math to avoid being parsed as a markdown list item). 
Stack - Lane H end-to-end (codex's #1331→#1373) - Working-set-manager + DAR end-to-end (mine, #1346→#1382) - THIS PR — inference-llm PR-1: typed event surface - NEXT — PR-2: InferenceLlmModule ServiceModule impl wired to the artifact dispatch - THEN — PR-3: tokenizer + llama.cpp invoke + token stream Co-Authored-By: Claude Opus 4.7 (1M context) --- src/clippy-baseline.txt | 2 +- .../src/inference/llm_module.rs | 595 ++++++++++++++++++ .../continuum-core/src/inference/mod.rs | 1 + 3 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 src/workers/continuum-core/src/inference/llm_module.rs diff --git a/src/clippy-baseline.txt b/src/clippy-baseline.txt index a2ecc456e..91b629b0f 100644 --- a/src/clippy-baseline.txt +++ b/src/clippy-baseline.txt @@ -1 +1 @@ -154 +156 diff --git a/src/workers/continuum-core/src/inference/llm_module.rs b/src/workers/continuum-core/src/inference/llm_module.rs new file mode 100644 index 000000000..1a699a7c8 --- /dev/null +++ b/src/workers/continuum-core/src/inference/llm_module.rs @@ -0,0 +1,595 @@ +//! `inference-llm` PR-1: typed wire shapes for the local-LLM +//! generation module. Per MODULE-CATALOG §II `inference-llm`. +//! +//! The module itself (composition → tokenizer → llama.cpp invoke → +//! token stream + reprojection metadata) lands in PR-2/PR-3. PR-1 +//! ships the typed event surface so: +//! +//! - Producers (persona-cognition) can emit `InferenceRequest` per +//! the canonical shape +//! - Consumers (sentinel-observer, VDD harness, audit-recorder) +//! can subscribe to `InferenceComplete` / `FirstTokenEmitted` / +//! `ResidencyFault` and start building against the wire today +//! - Downstream PRs land the inference engine itself against this +//! already-frozen contract +//! +//! Same slice shape as the genome (#1346) and recall (#1366) PR-1s: +//! pure data + serde + ts-rs exports + tests pinning every wire +//! invariant. No I/O, no async, no traits. +//! +//! ## What PR-1 ships +//! +//! 
- `InferenceRequest` — `[InferenceRequest]` subscription event; +//! carries persona + composition_plan + prompt + budget + sampling +//! - `InferenceComplete` — emission; carries persona + request id + +//! completion tokens + finish reason + elapsed_ms + tokens +//! - `FirstTokenEmitted` — emission for time-to-first-token +//! observability +//! - `ResidencyFault` — emission when inference would need a +//! not-currently-resident page; sentinel learns from these +//! - `FinishReason` enum (Stop / MaxTokens / MaxDuration / StopSequence / Error) +//! - `SamplingParams` struct (temperature, top_p, top_k, +//! repeat_penalty) +//! - `GenerationBudget` struct (max_tokens, max_duration_ms) +//! - `InferenceRequestId` newtype around Uuid for typed request +//! correlation across the four events +//! - `CompositionPlan` opaque stub — the composer module owns the +//! full shape; PR-1 ships a typed reference so InferenceRequest +//! compiles +//! +//! ## What PR-1 does NOT ship (PR-2 / PR-3) +//! +//! - `InferenceLlmModule` ServiceModule impl — PR-2 +//! - Tokenizer + composition-plan-to-tokens translation — PR-3 +//! - llama.cpp invocation + token streaming — PR-3 +//! - Reprojection metadata emission — PR-3 or separate +//! - Bus wiring + Runtime registration — PR-2/PR-3 +//! - InferenceLlmCandidateSource (consumes DAR recall to build +//! composition plans) — that's a recall-side PR for later + +use serde::{Deserialize, Serialize}; +use ts_rs::TS; +use uuid::Uuid; + +use crate::genome::working_set::{ArtifactId, PageRef, PersonaId}; + +// ─── ID newtype ───────────────────────────────────────────────── + +/// Typed identifier for one InferenceRequest. The four events +/// (Request / Complete / FirstToken / ResidencyFault) all carry +/// the same `InferenceRequestId` so consumers can correlate them. +/// Generated by the producer (typically persona-cognition); the +/// inference engine echoes it through the response events. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)] +#[serde(transparent)] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/InferenceRequestId.ts", + type = "string" +)] +pub struct InferenceRequestId(pub Uuid); + +impl InferenceRequestId { + pub fn new(uuid: Uuid) -> Self { + Self(uuid) + } + pub fn as_uuid(&self) -> Uuid { + self.0 + } +} + +// ─── Composition plan stub ────────────────────────────────────── + +/// Opaque reference to a composition plan. The composer module +/// (MODULE-CATALOG §II `composer`, not yet built) will own the +/// full shape with LoRA stacking order + per-artifact weights + +/// KV cache references. PR-1 ships a content-addressed reference +/// so InferenceRequest compiles + downstream consumers can wire +/// to it today. +/// +/// Wire form: a UUID string (artifact id of the composition plan +/// blob). Transparent serde — TS consumers see a string. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)] +#[serde(transparent)] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/CompositionPlan.ts", + type = "string" +)] +pub struct CompositionPlan(pub ArtifactId); + +// ─── Sampling + budget ────────────────────────────────────────── + +/// Sampling parameters for the LLM generation. The defaults match +/// llama.cpp's sensible-baseline values for chat-style generation; +/// caller overrides per-request. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/SamplingParams.ts" +)] +pub struct SamplingParams { + /// Sampling temperature. 0.0 = greedy; 1.0 = neutral; > 1.0 = + /// more diverse. Llama.cpp default 0.8. + pub temperature: f32, + /// Nucleus sampling cutoff. Keep tokens whose cumulative + /// probability ≥ top_p. 1.0 disables. Llama.cpp default 0.95. + pub top_p: f32, + /// Top-K sampling cutoff. 
Keep only top K candidates; 0 = all. + /// Llama.cpp default 40. + #[ts(type = "number")] + pub top_k: u32, + /// Repeat penalty. >1.0 penalizes repeated tokens. Llama.cpp + /// default 1.1. + pub repeat_penalty: f32, +} + +impl Default for SamplingParams { + fn default() -> Self { + Self { + temperature: 0.8, + top_p: 0.95, + top_k: 40, + repeat_penalty: 1.1, + } + } +} + +/// Resource budget for a generation. Mirrors the spec's +/// "InferenceRequest takes a budget" requirement; the inference +/// engine honors both ceilings (whichever hits first stops +/// generation). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/GenerationBudget.ts" +)] +pub struct GenerationBudget { + /// Maximum tokens to generate before stopping with + /// FinishReason::MaxTokens. 0 = unlimited (caller takes + /// duration responsibility). + #[ts(type = "number")] + pub max_tokens: u32, + /// Wall-clock deadline in milliseconds from request receipt. + /// 0 = no time limit. When the limit hits first the engine + /// stops with FinishReason::MaxDuration. + #[ts(type = "number")] + pub max_duration_ms: u32, +} + +// ─── Finish reason ────────────────────────────────────────────── + +/// Why generation stopped. Each variant carries the context the +/// observability stack needs to debug: +/// +/// - `Stop` — the model emitted an EOS token (natural stop) +/// - `MaxTokens` — hit `GenerationBudget.max_tokens`; caller may +/// want to retry with a higher budget +/// - `MaxDuration` — hit `GenerationBudget.max_duration_ms`; caller +/// should re-budget or accept partial response +/// - `StopSequence { matched }` — caller-provided stop sequence +/// matched the output. `matched` is the literal that fired. +/// - `Error { reason }` — generation failed for a reason that +/// wasn't a budget exhaustion. 
Per Joel's never-swallow-errors: +/// error is typed, reason is loud. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, TS)] +#[serde(tag = "kind", rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/FinishReason.ts" +)] +pub enum FinishReason { + Stop, + MaxTokens, + MaxDuration, + StopSequence { matched: String }, + Error { reason: String }, +} + +// ─── Events ───────────────────────────────────────────────────── + +/// The `[InferenceRequest]` subscription event. Persona-cognition +/// emits one per turn; the inference-llm module subscribes + runs +/// the generation. Producers populate `request_id` with a fresh +/// Uuid; the engine echoes it in the response events for +/// correlation. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/InferenceRequest.ts" +)] +pub struct InferenceRequest { + pub request_id: InferenceRequestId, + pub persona: PersonaId, + pub composition: CompositionPlan, + /// Tokenized prompt. PR-1 carries the token ids; PR-3's + /// inference engine consumes them directly. The tokenizer + /// lives in persona-cognition or a separate tokenizer module + /// (PR-3 decides). + #[ts(type = "Array")] + pub prompt_tokens: Vec, + pub budget: GenerationBudget, + pub sampling: SamplingParams, + /// Optional caller-provided stop sequences. Generation halts + /// with FinishReason::StopSequence on first match. Empty Vec + /// = no caller stop sequences (only EOS + budget halt). + pub stop_sequences: Vec, +} + +/// Emitted when generation completes (any FinishReason). Carries +/// the full response + timing for observability + sentinel +/// attribution. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/InferenceComplete.ts" +)] +pub struct InferenceComplete { + pub request_id: InferenceRequestId, + pub persona: PersonaId, + /// Tokens emitted by the model. Caller (persona-cognition) + /// detokenizes if it needs the string form. + #[ts(type = "Array")] + pub completion_tokens: Vec, + pub finish_reason: FinishReason, + /// Wall-clock duration from request receipt to last token. + #[ts(type = "number")] + pub elapsed_ms: u64, + /// Number of tokens generated. Equals `completion_tokens.len()` + /// but stored as a field so consumers don't have to deserialize + /// the full Vec to know the count. + #[ts(type = "number")] + pub tokens_generated: u32, +} + +/// Emitted when the model produces its first token. Drives the +/// time-to-first-token (TTFT) latency budget the VDD harness +/// tracks per turn. Separate event from `InferenceComplete` so +/// observability can wire "user sees something" telemetry without +/// blocking on full generation. +/// +/// Engines that don't stream (atomic generate-then-emit) emit +/// FirstTokenEmitted with `elapsed_us` equal to +/// `InferenceComplete.elapsed_ms` times 1000 — the contract is +/// "the first token left the engine at this timestamp," not +/// "the engine generated the first token in isolation." +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/FirstTokenEmitted.ts" +)] +pub struct FirstTokenEmitted { + pub request_id: InferenceRequestId, + pub persona: PersonaId, + /// Microseconds from request receipt to first token emission. + /// Microsecond precision because sub-ms TTFT is achievable on + /// hot-path warm models. 
+ #[ts(type = "number")] + pub elapsed_us: u64, +} + +/// Emitted when inference would have needed a page that isn't +/// resident in the persona's working set. The engine refuses +/// (per the no-CPU-fallback contract from #1341) rather than +/// silently demoting; sentinel learns from these to upgrade the +/// missing page's tier policy. +/// +/// The page reference identifies the missing artifact. Reason +/// explains why it wasn't resident (cold miss / evicted mid-turn +/// / never imported by foundry). +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/inference_llm/ResidencyFault.ts" +)] +pub struct ResidencyFault { + pub request_id: InferenceRequestId, + pub persona: PersonaId, + pub missing_page: PageRef, + /// Loud reason per Joel's never-swallow-errors rule. Examples: + /// "page evicted mid-turn by Bench LFU policy", "foundry + /// never imported MoE expert 3 of artifact X", "KV cache + /// chunk 4 not in working set." + pub reason: String, +} + +#[cfg(test)] +mod tests { + //! Pin every wire invariant the type system + serde encoding + //! guarantee. Same pattern as genome PR-1 + recall PR-1. + use super::*; + use crate::genome::working_set::{PageKind, PageOffset}; + + fn sample_persona() -> PersonaId { + PersonaId::new(Uuid::from_u128(1)) + } + fn sample_request_id() -> InferenceRequestId { + InferenceRequestId::new(Uuid::from_u128(42)) + } + fn sample_composition() -> CompositionPlan { + CompositionPlan(ArtifactId::new(Uuid::from_u128(100))) + } + fn sample_page() -> PageRef { + PageRef { + kind: PageKind::LoRALayer, + artifact: ArtifactId::new(Uuid::from_u128(200)), + offset: PageOffset::Whole, + } + } + + /// What this catches: InferenceRequestId serializes as a + /// transparent UUID string (not a wrapping object). Wire + /// stability — TS consumers parse as string. 
+ #[test] + fn inference_request_id_serializes_transparent() { + let id = InferenceRequestId(Uuid::from_u128(42)); + let json = serde_json::to_string(&id).unwrap(); + // Just verify it's a bare string, not an object. + assert!(json.starts_with('"') && json.ends_with('"')); + assert!(!json.contains('{')); + } + + /// What this catches: CompositionPlan is transparent over a + /// UUID. Composer module replaces with the full shape later; + /// the wire stays a string. + #[test] + fn composition_plan_serializes_transparent() { + let plan = sample_composition(); + let json = serde_json::to_string(&plan).unwrap(); + assert!(json.starts_with('"') && json.ends_with('"')); + assert!(!json.contains('{')); + } + + /// What this catches: default SamplingParams match the llama.cpp + /// sensible baseline. If a future PR drifts a default, this test + /// flags it — that's a substrate-level generation behavior + /// change. + #[test] + fn default_sampling_matches_llama_cpp_baseline() { + let s = SamplingParams::default(); + assert!((s.temperature - 0.8).abs() < 1e-6); + assert!((s.top_p - 0.95).abs() < 1e-6); + assert_eq!(s.top_k, 40); + assert!((s.repeat_penalty - 1.1).abs() < 1e-6); + } + + /// What this catches: SamplingParams serializes with camelCase + /// fields (topP, topK, repeatPenalty). TS consumers parse the + /// camelCase form. + #[test] + fn sampling_params_serializes_camel_case() { + let s = SamplingParams::default(); + let j = serde_json::to_string(&s).unwrap(); + assert!(j.contains("\"temperature\":"), "got {j}"); + assert!(j.contains("\"topP\":"), "got {j}"); + assert!(j.contains("\"topK\":"), "got {j}"); + assert!(j.contains("\"repeatPenalty\":"), "got {j}"); + } + + /// What this catches: GenerationBudget serializes with + /// camelCase fields. The two zero-means-unlimited fields + /// (max_tokens + max_duration_ms) preserve their semantic + /// across the wire. 
+ #[test] + fn generation_budget_serializes_camel_case() { + let b = GenerationBudget { + max_tokens: 100, + max_duration_ms: 5000, + }; + let j = serde_json::to_string(&b).unwrap(); + assert!(j.contains("\"maxTokens\":100"), "got {j}"); + assert!(j.contains("\"maxDurationMs\":5000"), "got {j}"); + } + + /// What this catches: FinishReason variants serialize with the + /// `kind` tag (camelCase). TS consumers narrow by it. Each + /// variant's payload preserved through serde round-trip. + #[test] + fn finish_reason_serializes_with_kind_tag() { + assert_eq!( + serde_json::to_string(&FinishReason::Stop).unwrap(), + "{\"kind\":\"stop\"}" + ); + assert_eq!( + serde_json::to_string(&FinishReason::MaxTokens).unwrap(), + "{\"kind\":\"maxTokens\"}" + ); + assert_eq!( + serde_json::to_string(&FinishReason::MaxDuration).unwrap(), + "{\"kind\":\"maxDuration\"}" + ); + + let stop_seq = FinishReason::StopSequence { + matched: "STOP".into(), + }; + let j = serde_json::to_string(&stop_seq).unwrap(); + assert!(j.contains("\"kind\":\"stopSequence\""), "got {j}"); + assert!(j.contains("\"matched\":\"STOP\""), "got {j}"); + + let err = FinishReason::Error { + reason: "context overflow".into(), + }; + let j = serde_json::to_string(&err).unwrap(); + assert!(j.contains("\"kind\":\"error\""), "got {j}"); + assert!(j.contains("\"reason\":\"context overflow\""), "got {j}"); + } + + /// What this catches: InferenceRequest round-trips through + /// serde with all fields intact. This is the contract every + /// producer-of-requests (persona-cognition) emits. 
+ #[test] + fn inference_request_round_trips_through_serde() { + let req = InferenceRequest { + request_id: sample_request_id(), + persona: sample_persona(), + composition: sample_composition(), + prompt_tokens: vec![1, 2, 3, 4, 5], + budget: GenerationBudget { + max_tokens: 100, + max_duration_ms: 5000, + }, + sampling: SamplingParams::default(), + stop_sequences: vec!["STOP".into()], + }; + let json = serde_json::to_string(&req).unwrap(); + let back: InferenceRequest = serde_json::from_str(&json).unwrap(); + assert_eq!(req, back); + } + + /// What this catches: InferenceRequest serializes camelCase + /// field names. Wire stability for TS consumers. + #[test] + fn inference_request_field_names_are_camel_case() { + let req = InferenceRequest { + request_id: sample_request_id(), + persona: sample_persona(), + composition: sample_composition(), + prompt_tokens: vec![1], + budget: GenerationBudget { + max_tokens: 10, + max_duration_ms: 100, + }, + sampling: SamplingParams::default(), + stop_sequences: vec![], + }; + let j = serde_json::to_string(&req).unwrap(); + assert!(j.contains("\"requestId\":"), "got {j}"); + assert!(j.contains("\"promptTokens\":"), "got {j}"); + assert!(j.contains("\"stopSequences\":"), "got {j}"); + } + + /// What this catches: InferenceComplete round-trips. This is + /// the most-consumed event — sentinel-observer + VDD harness + + /// audit-recorder all read it. + #[test] + fn inference_complete_round_trips_through_serde() { + let c = InferenceComplete { + request_id: sample_request_id(), + persona: sample_persona(), + completion_tokens: vec![10, 11, 12], + finish_reason: FinishReason::MaxTokens, + elapsed_ms: 1234, + tokens_generated: 3, + }; + let json = serde_json::to_string(&c).unwrap(); + let back: InferenceComplete = serde_json::from_str(&json).unwrap(); + assert_eq!(c, back); + } + + /// What this catches: FirstTokenEmitted wire shape. TTFT is + /// the load-bearing latency signal; consumers (VDD harness) + /// will hammer this event. 
+ #[test] + fn first_token_emitted_round_trips_and_uses_microseconds() { + let f = FirstTokenEmitted { + request_id: sample_request_id(), + persona: sample_persona(), + elapsed_us: 42_000, + }; + let json = serde_json::to_string(&f).unwrap(); + assert!(json.contains("\"elapsedUs\":42000"), "got {json}"); + let back: FirstTokenEmitted = serde_json::from_str(&json).unwrap(); + assert_eq!(f, back); + } + + /// What this catches: ResidencyFault carries the missing page + /// + reason. Sentinel-observer subscribes to learn which pages + /// to upgrade in tier policy. + #[test] + fn residency_fault_round_trips_with_missing_page_and_reason() { + let r = ResidencyFault { + request_id: sample_request_id(), + persona: sample_persona(), + missing_page: sample_page(), + reason: "page evicted mid-turn by Bench LFU policy".into(), + }; + let json = serde_json::to_string(&r).unwrap(); + assert!(json.contains("\"missingPage\":"), "got {json}"); + assert!(json.contains("\"reason\":"), "got {json}"); + let back: ResidencyFault = serde_json::from_str(&json).unwrap(); + assert_eq!(r, back); + } + + /// What this catches: an empty stop_sequences Vec serializes + /// as `[]`, not `null` or missing. Consumers (engine) walk the + /// Vec; treating empty as absent would silently behave like + /// "no stop sequence at all," which is correct, but the wire + /// shape must be consistent. 
+ #[test] + fn empty_stop_sequences_serialize_as_empty_array() { + let req = InferenceRequest { + request_id: sample_request_id(), + persona: sample_persona(), + composition: sample_composition(), + prompt_tokens: vec![], + budget: GenerationBudget { + max_tokens: 0, + max_duration_ms: 0, + }, + sampling: SamplingParams::default(), + stop_sequences: vec![], + }; + let j = serde_json::to_string(&req).unwrap(); + assert!(j.contains("\"stopSequences\":[]"), "got {j}"); + } + + /// What this catches: all four event types use the same + /// InferenceRequestId field name (`requestId` on the wire) so + /// consumers can correlate across the four streams with a + /// single key extraction. Wire convention pin. + #[test] + fn all_four_events_use_same_request_id_field_name() { + let id = sample_request_id(); + let persona = sample_persona(); + + let req = InferenceRequest { + request_id: id, + persona, + composition: sample_composition(), + prompt_tokens: vec![], + budget: GenerationBudget { + max_tokens: 0, + max_duration_ms: 0, + }, + sampling: SamplingParams::default(), + stop_sequences: vec![], + }; + let complete = InferenceComplete { + request_id: id, + persona, + completion_tokens: vec![], + finish_reason: FinishReason::Stop, + elapsed_ms: 0, + tokens_generated: 0, + }; + let first = FirstTokenEmitted { + request_id: id, + persona, + elapsed_us: 0, + }; + let fault = ResidencyFault { + request_id: id, + persona, + missing_page: sample_page(), + reason: "test".into(), + }; + + for json in [ + serde_json::to_string(&req).unwrap(), + serde_json::to_string(&complete).unwrap(), + serde_json::to_string(&first).unwrap(), + serde_json::to_string(&fault).unwrap(), + ] { + assert!( + json.contains("\"requestId\":"), + "every event must use requestId for correlation; got {json}" + ); + } + } +} diff --git a/src/workers/continuum-core/src/inference/mod.rs b/src/workers/continuum-core/src/inference/mod.rs index 395a84e0f..2c3dcd950 100644 --- 
a/src/workers/continuum-core/src/inference/mod.rs +++ b/src/workers/continuum-core/src/inference/mod.rs @@ -33,6 +33,7 @@ pub mod backends; pub mod footprint_registry; pub mod kv_quant; pub mod llamacpp_adapter; +pub mod llm_module; pub mod lora; pub mod model; pub mod ort_providers;