CambrianTech · joelteply · May 18, 2026 · May 18, 2026
diff --git a/src/shared/generated/inference_llm/CompositionPlan.ts b/src/shared/generated/inference_llm/CompositionPlan.ts
@@ -0,0 +1,14 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+/**
+ * Opaque reference to a composition plan. The composer module
+ * (MODULE-CATALOG §II `composer`, not yet built) will own the
+ * full shape with LoRA stacking order + per-artifact weights +
+ * KV cache references. PR-1 ships a content-addressed reference
+ * so InferenceRequest compiles + downstream consumers can wire
+ * to it today.
+ *
+ * Wire form: a UUID string (artifact id of the composition plan
+ * blob). Transparent serde — TS consumers see a string.
+ */
+export type CompositionPlan = string;
diff --git a/src/shared/generated/inference_llm/FinishReason.ts b/src/shared/generated/inference_llm/FinishReason.ts
@@ -0,0 +1,18 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+/**
+ * Why generation stopped. Each variant carries the context the
+ * observability stack needs to debug:
+ *
+ * - `Stop` — the model emitted an EOS token (natural stop)
+ * - `MaxTokens` — hit `GenerationBudget.max_tokens`; caller may
+ *   want to retry with a higher budget
+ * - `MaxDuration` — hit `GenerationBudget.max_duration_ms`; caller
+ *   should re-budget or accept partial response
+ * - `StopSequence { matched }` — caller-provided stop sequence
+ *   matched the output. `matched` is the literal that fired.
+ * - `Error { reason }` — generation failed for a reason that
+ *   wasn't a budget exhaustion. Per Joel's never-swallow-errors:
+ *   error is typed, reason is loud.
+ */
+export type FinishReason = { "kind": "stop" } | { "kind": "maxTokens" } | { "kind": "maxDuration" } | { "kind": "stopSequence", matched: string, } | { "kind": "error", reason: string, };
diff --git a/src/shared/generated/inference_llm/FirstTokenEmitted.ts b/src/shared/generated/inference_llm/FirstTokenEmitted.ts
@@ -0,0 +1,24 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+import type { PersonaId } from "../genome/PersonaId";
+import type { InferenceRequestId } from "./InferenceRequestId";
+
+/**
+ * Emitted when the model produces its first token. Drives the
+ * time-to-first-token (TTFT) latency budget the VDD harness
+ * tracks per turn. Separate event from `InferenceComplete` so
+ * observability can wire "user sees something" telemetry without
+ * blocking on full generation.
+ *
+ * Engines that don't stream (atomic generate-then-emit) emit
+ * FirstTokenEmitted with `elapsed_us` equal to
+ * `InferenceComplete.elapsed_ms` times 1000 — the contract is
+ * "the first token left the engine at this timestamp," not
+ * "the engine generated the first token in isolation."
+ */
+export type FirstTokenEmitted = { requestId: InferenceRequestId, persona: PersonaId, 
+/**
+ * Microseconds from request receipt to first token emission.
+ * Microsecond precision because sub-ms TTFT is achievable on
+ * hot-path warm models.
+ */
+elapsedUs: number, };
diff --git a/src/shared/generated/inference_llm/GenerationBudget.ts b/src/shared/generated/inference_llm/GenerationBudget.ts
@@ -0,0 +1,21 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+/**
+ * Resource budget for a generation. Mirrors the spec's
+ * "InferenceRequest takes a budget" requirement; the inference
+ * engine honors both ceilings (whichever hits first stops
+ * generation).
+ */
+export type GenerationBudget = { 
+/**
+ * Maximum tokens to generate before stopping with
+ * FinishReason::MaxTokens. 0 = unlimited (caller takes
+ * duration responsibility).
+ */
+maxTokens: number, 
+/**
+ * Wall-clock deadline in milliseconds from request receipt.
+ * 0 = no time limit. When the limit hits first the engine
+ * stops with FinishReason::MaxDuration.
+ */
+maxDurationMs: number, };
diff --git a/src/shared/generated/inference_llm/InferenceComplete.ts b/src/shared/generated/inference_llm/InferenceComplete.ts
@@ -0,0 +1,34 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+import type { PersonaId } from "../genome/PersonaId";
+import type { FinishReason } from "./FinishReason";
+import type { InferenceRequestId } from "./InferenceRequestId";
+
+/**
+ * Emitted when generation completes (any FinishReason). Carries
+ * the full response + timing for observability + sentinel
+ * attribution.
+ */
+export type InferenceComplete = { requestId: InferenceRequestId, persona: PersonaId, 
+/**
+ * Tokens emitted by the model. Raw-token engines populate this
+ * directly; adapter-based engines (PR-4) leave it as an empty Vec
+ * and put the actual output in `completion_text`, because the
+ * adapter doesn't expose token-level output.
+ */
+completionTokens: Array<number>, 
+/**
+ * PR-4 addition: plain-text completion from adapter-based
+ * engines (LlamaCppAdapter). `None` = raw-token path; the
+ * caller decodes `completion_tokens` if it needs text.
+ */
+completionText?: string, finishReason: FinishReason, 
+/**
+ * Wall-clock duration from request receipt to last token.
+ */
+elapsedMs: number, 
+/**
+ * Number of tokens generated. Equals `completion_tokens.len()`
+ * for raw-token engines; adapter-based engines populate from
+ * the adapter's UsageMetrics.completion_tokens count.
+ */
+tokensGenerated: number, };
diff --git a/src/shared/generated/inference_llm/InferenceRequest.ts b/src/shared/generated/inference_llm/InferenceRequest.ts
@@ -0,0 +1,38 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+import type { PersonaId } from "../genome/PersonaId";
+import type { CompositionPlan } from "./CompositionPlan";
+import type { GenerationBudget } from "./GenerationBudget";
+import type { InferenceRequestId } from "./InferenceRequestId";
+import type { SamplingParams } from "./SamplingParams";
+
+/**
+ * The `[InferenceRequest]` subscription event. Persona-cognition
+ * emits one per turn; the inference-llm module subscribes + runs
+ * the generation. Producers populate `request_id` with a fresh
+ * Uuid; the engine echoes it in the response events for
+ * correlation.
+ */
+export type InferenceRequest = { requestId: InferenceRequestId, persona: PersonaId, composition: CompositionPlan, 
+/**
+ * Tokenized prompt for raw-token engines. PR-1 ships this as
+ * the canonical input; PR-4 adds `prompt_text` for adapter-
+ * based engines (LlamaCppAdapter) that tokenize internally.
+ * At least one of (prompt_tokens, prompt_text) must be
+ * non-empty; the engine chooses based on its capability.
+ */
+promptTokens: Array<number>, 
+/**
+ * PR-4 addition: plain-text prompt for engines that tokenize
+ * internally (AIProviderAdapter-backed paths like
+ * LlamaCppAdapter). `None` = caller is using the
+ * prompt_tokens path. When set, adapter-based engines wrap
+ * it as a single user-role `ChatMessage` before calling
+ * `generate_text`.
+ */
+promptText?: string, budget: GenerationBudget, sampling: SamplingParams, 
+/**
+ * Optional caller-provided stop sequences. Generation halts
+ * with FinishReason::StopSequence on first match. Empty Vec
+ * = no caller stop sequences (only EOS + budget halt).
+ */
+stopSequences: Array<string>, };
diff --git a/src/shared/generated/inference_llm/InferenceRequestId.ts b/src/shared/generated/inference_llm/InferenceRequestId.ts
@@ -0,0 +1,10 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+/**
+ * Typed identifier for one InferenceRequest. The four events
+ * (Request / Complete / FirstToken / ResidencyFault) all carry
+ * the same `InferenceRequestId` so consumers can correlate them.
+ * Generated by the producer (typically persona-cognition); the
+ * inference engine echoes it through the response events.
+ */
+export type InferenceRequestId = string;
diff --git a/src/shared/generated/inference_llm/ResidencyFault.ts b/src/shared/generated/inference_llm/ResidencyFault.ts
@@ -0,0 +1,24 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+import type { PageRef } from "../genome/PageRef";
+import type { PersonaId } from "../genome/PersonaId";
+import type { InferenceRequestId } from "./InferenceRequestId";
+
+/**
+ * Emitted when inference would have needed a page that isn't
+ * resident in the persona's working set. The engine refuses
+ * (per the no-CPU-fallback contract from #1341) rather than
+ * silently demoting; sentinel learns from these to upgrade the
+ * missing page's tier policy.
+ *
+ * The page reference identifies the missing artifact. Reason
+ * explains why it wasn't resident (cold miss / evicted mid-turn
+ * / never imported by foundry).
+ */
+export type ResidencyFault = { requestId: InferenceRequestId, persona: PersonaId, missingPage: PageRef, 
+/**
+ * Loud reason per Joel's never-swallow-errors rule. Examples:
+ * "page evicted mid-turn by Bench LFU policy", "foundry
+ * never imported MoE expert 3 of artifact X", "KV cache
+ * chunk 4 not in working set."
+ */
+reason: string, };
diff --git a/src/shared/generated/inference_llm/SamplingParams.ts b/src/shared/generated/inference_llm/SamplingParams.ts
@@ -0,0 +1,28 @@
+// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
+
+/**
+ * Sampling parameters for the LLM generation. The defaults match
+ * llama.cpp's sensible-baseline values for chat-style generation;
+ * caller overrides per-request.
+ */
+export type SamplingParams = { 
+/**
+ * Sampling temperature. 0.0 = greedy; 1.0 = neutral; > 1.0 =
+ * more diverse. Llama.cpp default 0.8.
+ */
+temperature: number, 
+/**
+ * Nucleus sampling cutoff. Keep the smallest set of most-likely tokens
+ * whose cumulative probability ≥ top_p. 1.0 disables. Llama.cpp default 0.95.
+ */
+topP: number, 
+/**
+ * Top-K sampling cutoff. Keep only top K candidates; 0 = all.
+ * Llama.cpp default 40.
+ */
+topK: number, 
+/**
+ * Repeat penalty. >1.0 penalizes repeated tokens. Llama.cpp
+ * default 1.1.
+ */
+repeatPenalty: number, };
diff --git a/src/shared/generated/inference_llm/index.ts b/src/shared/generated/inference_llm/index.ts
@@ -0,0 +1,13 @@
+// Auto-generated barrel export — do not edit manually
+// Source: generator/generate-rust-bindings.ts
+// Re-generate: npx tsx generator/generate-rust-bindings.ts
+
+export type { CompositionPlan } from './CompositionPlan';
+export type { FinishReason } from './FinishReason';
+export type { FirstTokenEmitted } from './FirstTokenEmitted';
+export type { GenerationBudget } from './GenerationBudget';
+export type { InferenceComplete } from './InferenceComplete';
+export type { InferenceRequest } from './InferenceRequest';
+export type { InferenceRequestId } from './InferenceRequestId';
+export type { ResidencyFault } from './ResidencyFault';
+export type { SamplingParams } from './SamplingParams';
diff --git a/src/workers/continuum-core/src/inference/llm_module.rs b/src/workers/continuum-core/src/inference/llm_module.rs
@@ -205,12 +205,22 @@ pub struct InferenceRequest {
     pub request_id: InferenceRequestId,
     pub persona: PersonaId,
     pub composition: CompositionPlan,
-    /// Tokenized prompt. PR-1 carries the token ids; PR-3's
-    /// inference engine consumes them directly. The tokenizer
-    /// lives in persona-cognition or a separate tokenizer module
-    /// (PR-3 decides).
+    /// Tokenized prompt for raw-token engines. PR-1 ships this as
+    /// the canonical input; PR-4 adds `prompt_text` for adapter-
+    /// based engines (LlamaCppAdapter) that tokenize internally.
+    /// At least one of (prompt_tokens, prompt_text) must be
+    /// non-empty; the engine chooses based on its capability.
     #[ts(type = "Array<number>")]
     pub prompt_tokens: Vec<u32>,
+    /// PR-4 addition: plain-text prompt for engines that tokenize
+    /// internally (AIProviderAdapter-backed paths like
+    /// LlamaCppAdapter). `None` = caller is using the
+    /// prompt_tokens path. When set, adapter-based engines wrap
+    /// it as a single user-role `ChatMessage` before calling
+    /// `generate_text`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[ts(optional)]
+    pub prompt_text: Option<String>,
     pub budget: GenerationBudget,
     pub sampling: SamplingParams,
     /// Optional caller-provided stop sequences. Generation halts
@@ -231,17 +241,25 @@ pub struct InferenceRequest {
 pub struct InferenceComplete {
     pub request_id: InferenceRequestId,
     pub persona: PersonaId,
-    /// Tokens emitted by the model. Caller (persona-cognition)
-    /// detokenizes if it needs the string form.
+    /// Tokens emitted by the model. Raw-token engines populate this
+    /// directly; adapter-based engines (PR-4) leave it as an empty Vec
+    /// and put the actual output in `completion_text`, because the
+    /// adapter doesn't expose token-level output.
     #[ts(type = "Array<number>")]
     pub completion_tokens: Vec<u32>,
+    /// PR-4 addition: plain-text completion from adapter-based
+    /// engines (LlamaCppAdapter). `None` = raw-token path; the
+    /// caller decodes `completion_tokens` if it needs text.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[ts(optional)]
+    pub completion_text: Option<String>,
     pub finish_reason: FinishReason,
     /// Wall-clock duration from request receipt to last token.
     #[ts(type = "number")]
     pub elapsed_ms: u64,
     /// Number of tokens generated. Equals `completion_tokens.len()`
-    /// but stored as a field so consumers don't have to deserialize
-    /// the full Vec to know the count.
+    /// for raw-token engines; adapter-based engines populate from
+    /// the adapter's UsageMetrics.completion_tokens count.
     #[ts(type = "number")]
     pub tokens_generated: u32,
 }
@@ -430,6 +448,7 @@ mod tests {
             persona: sample_persona(),
             composition: sample_composition(),
             prompt_tokens: vec![1, 2, 3, 4, 5],
+            prompt_text: None,
             budget: GenerationBudget {
                 max_tokens: 100,
                 max_duration_ms: 5000,
@@ -451,6 +470,7 @@ mod tests {
             persona: sample_persona(),
             composition: sample_composition(),
             prompt_tokens: vec![1],
+            prompt_text: None,
             budget: GenerationBudget {
                 max_tokens: 10,
                 max_duration_ms: 100,
@@ -473,6 +493,7 @@ mod tests {
             request_id: sample_request_id(),
             persona: sample_persona(),
             completion_tokens: vec![10, 11, 12],
+            completion_text: None,
             finish_reason: FinishReason::MaxTokens,
             elapsed_ms: 1234,
             tokens_generated: 3,
@@ -528,6 +549,7 @@ mod tests {
             persona: sample_persona(),
             composition: sample_composition(),
             prompt_tokens: vec![],
+            prompt_text: None,
             budget: GenerationBudget {
                 max_tokens: 0,
                 max_duration_ms: 0,
@@ -553,6 +575,7 @@ mod tests {
             persona,
             composition: sample_composition(),
             prompt_tokens: vec![],
+            prompt_text: None,
             budget: GenerationBudget {
                 max_tokens: 0,
                 max_duration_ms: 0,
@@ -564,6 +587,7 @@ mod tests {
             request_id: id,
             persona,
             completion_tokens: vec![],
+            completion_text: None,
             finish_reason: FinishReason::Stop,
             elapsed_ms: 0,
             tokens_generated: 0,

diff --git a/src/workers/continuum-core/src/inference/llm_module_bus.rs b/src/workers/continuum-core/src/inference/llm_module_bus.rs
@@ -279,6 +279,7 @@ mod tests {
             persona: sample_persona(),
             composition: CompositionPlan(ArtifactId::new(Uuid::from_u128(100))),
             prompt_tokens: vec![1, 2, 3],
+            prompt_text: None,
             budget: GenerationBudget {
                 max_tokens: 100,
                 max_duration_ms: 5000,
@@ -292,6 +293,7 @@ mod tests {
             request_id: sample_request_id(),
             persona: sample_persona(),
             completion_tokens: vec![10, 11],
+            completion_text: None,
             finish_reason: FinishReason::Stop,
             elapsed_ms: 100,
             tokens_generated: 2,