Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/shared/generated/inference_llm/CompositionPlan.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.

/**
* Opaque reference to a composition plan. The composer module
* (MODULE-CATALOG §II `composer`, not yet built) will own the
* full shape with LoRA stacking order + per-artifact weights +
* KV cache references. PR-1 ships a content-addressed reference
* so InferenceRequest compiles + downstream consumers can wire
* to it today.
*
* Wire form: a UUID string (artifact id of the composition plan
* blob). Transparent serde — TS consumers see a string.
*
* NOTE: this is a plain alias of `string`, so the TypeScript
* compiler does not distinguish it from any other string value.
* Treat it as opaque: pass it through unmodified rather than
* parsing or constructing it by hand.
*/
export type CompositionPlan = string;
18 changes: 18 additions & 0 deletions src/shared/generated/inference_llm/FinishReason.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.

/**
 * Reason a generation run ended, modeled as a discriminated union on
 * `kind`. Every variant carries the context the observability stack
 * needs for debugging:
 *
 * - `"stop"` (Rust `Stop`) — natural stop: the model emitted an EOS
 *   token.
 * - `"maxTokens"` (Rust `MaxTokens`) — `GenerationBudget.maxTokens`
 *   was reached; the caller may want to retry with a higher budget.
 * - `"maxDuration"` (Rust `MaxDuration`) —
 *   `GenerationBudget.maxDurationMs` was reached; the caller should
 *   re-budget or accept the partial response.
 * - `"stopSequence"` (Rust `StopSequence { matched }`) — a
 *   caller-provided stop sequence matched the output; `matched` is
 *   the literal that fired.
 * - `"error"` (Rust `Error { reason }`) — generation failed for a
 *   reason other than budget exhaustion. Per Joel's
 *   never-swallow-errors rule the error is typed and `reason` is loud.
 */
export type FinishReason =
  | { kind: "stop" }
  | { kind: "maxTokens" }
  | { kind: "maxDuration" }
  | { kind: "stopSequence"; matched: string }
  | { kind: "error"; reason: string };
24 changes: 24 additions & 0 deletions src/shared/generated/inference_llm/FirstTokenEmitted.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { PersonaId } from "../genome/PersonaId";
import type { InferenceRequestId } from "./InferenceRequestId";

/**
 * Event emitted when the model produces its first token. It drives the
 * time-to-first-token (TTFT) latency budget that the VDD harness
 * tracks per turn. It is a separate event from `InferenceComplete` so
 * observability can wire "user sees something" telemetry without
 * blocking on full generation.
 *
 * Engines that do not stream (atomic generate-then-emit) emit
 * FirstTokenEmitted with `elapsedUs` (Rust `elapsed_us`) equal to
 * `InferenceComplete.elapsedMs` times 1000 — the contract is
 * "the first token left the engine at this timestamp," not
 * "the engine generated the first token in isolation."
 */
export type FirstTokenEmitted = {
  requestId: InferenceRequestId;
  persona: PersonaId;
  /**
   * Microseconds from request receipt to first token emission.
   * Microsecond precision is used because sub-millisecond TTFT is
   * achievable on hot-path warm models.
   */
  elapsedUs: number;
};
21 changes: 21 additions & 0 deletions src/shared/generated/inference_llm/GenerationBudget.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.

/**
 * Resource ceilings for a single generation. Mirrors the spec's
 * "InferenceRequest takes a budget" requirement: the inference engine
 * honors both limits, and whichever is reached first stops generation.
 */
export type GenerationBudget = {
  /**
   * Maximum number of tokens to generate before stopping with
   * FinishReason::MaxTokens (`kind: "maxTokens"` on the TS side).
   * 0 = unlimited (the caller takes responsibility for bounding
   * duration).
   */
  maxTokens: number;
  /**
   * Wall-clock deadline in milliseconds, measured from request
   * receipt. 0 = no time limit. When this limit is reached first the
   * engine stops with FinishReason::MaxDuration
   * (`kind: "maxDuration"`).
   */
  maxDurationMs: number;
};
34 changes: 34 additions & 0 deletions src/shared/generated/inference_llm/InferenceComplete.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { PersonaId } from "../genome/PersonaId";
import type { FinishReason } from "./FinishReason";
import type { InferenceRequestId } from "./InferenceRequestId";

/**
 * Event emitted when generation completes, whatever the FinishReason.
 * Carries the full response plus timing for observability and
 * sentinel attribution.
 */
export type InferenceComplete = {
  requestId: InferenceRequestId;
  persona: PersonaId;
  /**
   * Tokens emitted by the model. Raw-token engines populate this
   * directly; adapter-based engines (PR-4) leave it as an empty array
   * (Rust: empty Vec) and put the actual output in `completionText`
   * (Rust `completion_text`), because the adapter does not expose
   * token-level output.
   */
  completionTokens: number[];
  /**
   * PR-4 addition: plain-text completion from adapter-based engines
   * (LlamaCppAdapter). Omitted (Rust `None`) on the raw-token path;
   * in that case the caller decodes `completionTokens` if it needs
   * text.
   */
  completionText?: string;
  finishReason: FinishReason;
  /**
   * Wall-clock duration from request receipt to last token.
   */
  elapsedMs: number;
  /**
   * Number of tokens generated. Equals `completionTokens.length`
   * (Rust `completion_tokens.len()`) for raw-token engines;
   * adapter-based engines populate it from the adapter's
   * UsageMetrics.completion_tokens count.
   */
  tokensGenerated: number;
};
38 changes: 38 additions & 0 deletions src/shared/generated/inference_llm/InferenceRequest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { PersonaId } from "../genome/PersonaId";
import type { CompositionPlan } from "./CompositionPlan";
import type { GenerationBudget } from "./GenerationBudget";
import type { InferenceRequestId } from "./InferenceRequestId";
import type { SamplingParams } from "./SamplingParams";

/**
 * The `[InferenceRequest]` subscription event. Persona-cognition emits
 * one per turn; the inference-llm module subscribes and runs the
 * generation. Producers populate `requestId` (Rust `request_id`) with
 * a fresh Uuid, and the engine echoes it in the response events for
 * correlation.
 */
export type InferenceRequest = {
  requestId: InferenceRequestId;
  persona: PersonaId;
  composition: CompositionPlan;
  /**
   * Tokenized prompt for raw-token engines. PR-1 ships this as the
   * canonical input; PR-4 adds `promptText` (Rust `prompt_text`) for
   * adapter-based engines (LlamaCppAdapter) that tokenize internally.
   * At least one of (`promptTokens`, `promptText`) must be non-empty;
   * the engine chooses based on its capability.
   */
  promptTokens: number[];
  /**
   * PR-4 addition: plain-text prompt for engines that tokenize
   * internally (AIProviderAdapter-backed paths like LlamaCppAdapter).
   * Omitted (Rust `None`) when the caller is using the `promptTokens`
   * path. When set, adapter-based engines wrap it as a single
   * user-role `ChatMessage` before calling `generate_text`.
   */
  promptText?: string;
  budget: GenerationBudget;
  sampling: SamplingParams;
  /**
   * Optional caller-provided stop sequences. Generation halts with
   * FinishReason::StopSequence (`kind: "stopSequence"`) on the first
   * match. An empty array (Rust: empty Vec) means no caller stop
   * sequences, so only EOS and the budget halt generation.
   */
  stopSequences: string[];
};
10 changes: 10 additions & 0 deletions src/shared/generated/inference_llm/InferenceRequestId.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.

/**
* Typed identifier for one InferenceRequest. The four events
* (Request / Complete / FirstToken / ResidencyFault) all carry
* the same `InferenceRequestId` so consumers can correlate them.
* Generated by the producer (typically persona-cognition); the
* inference engine echoes it through the response events.
*
* NOTE: on the TypeScript side this is a plain alias of `string`,
* so the compiler does not distinguish it from any other string.
* NOTE(review): the exact string format is not stated here —
* presumably a UUID, since producers are described as populating
* it with a fresh Uuid; confirm before parsing or validating it.
*/
export type InferenceRequestId = string;
24 changes: 24 additions & 0 deletions src/shared/generated/inference_llm/ResidencyFault.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
import type { PageRef } from "../genome/PageRef";
import type { PersonaId } from "../genome/PersonaId";
import type { InferenceRequestId } from "./InferenceRequestId";

/**
 * Event emitted when inference would have needed a page that is not
 * resident in the persona's working set. The engine refuses (per the
 * no-CPU-fallback contract from #1341) rather than silently demoting;
 * sentinel learns from these events to upgrade the missing page's
 * tier policy.
 *
 * `missingPage` identifies the missing artifact. `reason` explains why
 * it was not resident (cold miss / evicted mid-turn / never imported
 * by foundry).
 */
export type ResidencyFault = {
  requestId: InferenceRequestId;
  persona: PersonaId;
  missingPage: PageRef;
  /**
   * Loud reason, per Joel's never-swallow-errors rule. Examples:
   * "page evicted mid-turn by Bench LFU policy", "foundry never
   * imported MoE expert 3 of artifact X", "KV cache chunk 4 not in
   * working set."
   */
  reason: string;
};
28 changes: 28 additions & 0 deletions src/shared/generated/inference_llm/SamplingParams.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.

/**
 * Sampling parameters for an LLM generation. The defaults match
 * llama.cpp's sensible-baseline values for chat-style generation;
 * callers override them per request.
 */
export type SamplingParams = {
  /**
   * Sampling temperature. 0.0 = greedy; 1.0 = neutral; values above
   * 1.0 give more diverse output. Llama.cpp default 0.8.
   */
  temperature: number;
  /**
   * Nucleus sampling cutoff (`top_p`). Keeps the smallest set of
   * most-probable tokens whose cumulative probability reaches
   * `top_p`. 1.0 disables the cutoff. Llama.cpp default 0.95.
   */
  topP: number;
  /**
   * Top-K sampling cutoff. Keeps only the top K candidates; 0 keeps
   * all of them. Llama.cpp default 40.
   */
  topK: number;
  /**
   * Repeat penalty. Values above 1.0 penalize repeated tokens.
   * Llama.cpp default 1.1.
   */
  repeatPenalty: number;
};
13 changes: 13 additions & 0 deletions src/shared/generated/inference_llm/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Auto-generated barrel export — do not edit manually
// Source: generator/generate-rust-bindings.ts
// Re-generate: npx tsx generator/generate-rust-bindings.ts

export type { CompositionPlan } from './CompositionPlan';
export type { FinishReason } from './FinishReason';
export type { FirstTokenEmitted } from './FirstTokenEmitted';
export type { GenerationBudget } from './GenerationBudget';
export type { InferenceComplete } from './InferenceComplete';
export type { InferenceRequest } from './InferenceRequest';
export type { InferenceRequestId } from './InferenceRequestId';
export type { ResidencyFault } from './ResidencyFault';
export type { SamplingParams } from './SamplingParams';
40 changes: 32 additions & 8 deletions src/workers/continuum-core/src/inference/llm_module.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,22 @@ pub struct InferenceRequest {
pub request_id: InferenceRequestId,
pub persona: PersonaId,
pub composition: CompositionPlan,
/// Tokenized prompt. PR-1 carries the token ids; PR-3's
/// inference engine consumes them directly. The tokenizer
/// lives in persona-cognition or a separate tokenizer module
/// (PR-3 decides).
/// Tokenized prompt for raw-token engines. PR-1 ships this as
/// the canonical input; PR-4 adds `prompt_text` for adapter-
/// based engines (LlamaCppAdapter) that tokenize internally.
/// At least one of (prompt_tokens, prompt_text) must be
/// non-empty; the engine chooses based on its capability.
#[ts(type = "Array<number>")]
pub prompt_tokens: Vec<u32>,
/// PR-4 addition: plain-text prompt for engines that tokenize
/// internally (AIProviderAdapter-backed paths like
/// LlamaCppAdapter). `None` = caller is using the
/// prompt_tokens path. When set, adapter-based engines wrap
/// it as a single user-role `ChatMessage` before calling
/// `generate_text`.
#[serde(default, skip_serializing_if = "Option::is_none")]
#[ts(optional)]
pub prompt_text: Option<String>,
pub budget: GenerationBudget,
pub sampling: SamplingParams,
/// Optional caller-provided stop sequences. Generation halts
Expand All @@ -231,17 +241,25 @@ pub struct InferenceRequest {
pub struct InferenceComplete {
pub request_id: InferenceRequestId,
pub persona: PersonaId,
/// Tokens emitted by the model. Caller (persona-cognition)
/// detokenizes if it needs the string form.
/// Tokens emitted by the model. Raw-token engines populate
/// directly; adapter-based engines (PR-4) populate empty Vec
/// + the actual output goes in `completion_text` because the
/// adapter doesn't expose token-level output.
#[ts(type = "Array<number>")]
pub completion_tokens: Vec<u32>,
/// PR-4 addition: plain-text completion from adapter-based
/// engines (LlamaCppAdapter). `None` = raw-token path; the
/// caller decodes `completion_tokens` if it needs text.
#[serde(default, skip_serializing_if = "Option::is_none")]
#[ts(optional)]
pub completion_text: Option<String>,
pub finish_reason: FinishReason,
/// Wall-clock duration from request receipt to last token.
#[ts(type = "number")]
pub elapsed_ms: u64,
/// Number of tokens generated. Equals `completion_tokens.len()`
/// but stored as a field so consumers don't have to deserialize
/// the full Vec to know the count.
/// for raw-token engines; adapter-based engines populate from
/// the adapter's UsageMetrics.completion_tokens count.
#[ts(type = "number")]
pub tokens_generated: u32,
}
Expand Down Expand Up @@ -430,6 +448,7 @@ mod tests {
persona: sample_persona(),
composition: sample_composition(),
prompt_tokens: vec![1, 2, 3, 4, 5],
prompt_text: None,
budget: GenerationBudget {
max_tokens: 100,
max_duration_ms: 5000,
Expand All @@ -451,6 +470,7 @@ mod tests {
persona: sample_persona(),
composition: sample_composition(),
prompt_tokens: vec![1],
prompt_text: None,
budget: GenerationBudget {
max_tokens: 10,
max_duration_ms: 100,
Expand All @@ -473,6 +493,7 @@ mod tests {
request_id: sample_request_id(),
persona: sample_persona(),
completion_tokens: vec![10, 11, 12],
completion_text: None,
finish_reason: FinishReason::MaxTokens,
elapsed_ms: 1234,
tokens_generated: 3,
Expand Down Expand Up @@ -528,6 +549,7 @@ mod tests {
persona: sample_persona(),
composition: sample_composition(),
prompt_tokens: vec![],
prompt_text: None,
budget: GenerationBudget {
max_tokens: 0,
max_duration_ms: 0,
Expand All @@ -553,6 +575,7 @@ mod tests {
persona,
composition: sample_composition(),
prompt_tokens: vec![],
prompt_text: None,
budget: GenerationBudget {
max_tokens: 0,
max_duration_ms: 0,
Expand All @@ -564,6 +587,7 @@ mod tests {
request_id: id,
persona,
completion_tokens: vec![],
completion_text: None,
finish_reason: FinishReason::Stop,
elapsed_ms: 0,
tokens_generated: 0,
Expand Down
2 changes: 2 additions & 0 deletions src/workers/continuum-core/src/inference/llm_module_bus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ mod tests {
persona: sample_persona(),
composition: CompositionPlan(ArtifactId::new(Uuid::from_u128(100))),
prompt_tokens: vec![1, 2, 3],
prompt_text: None,
budget: GenerationBudget {
max_tokens: 100,
max_duration_ms: 5000,
Expand All @@ -292,6 +293,7 @@ mod tests {
request_id: sample_request_id(),
persona: sample_persona(),
completion_tokens: vec![10, 11],
completion_text: None,
finish_reason: FinishReason::Stop,
elapsed_ms: 100,
tokens_generated: 2,
Expand Down
Loading
Loading