From f92307452eb1e27b7ed506a96458c9722cb9981b Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 22:03:47 -0500 Subject: [PATCH 1/3] Add Rust model resolver with hardware capability tiers (Lane C) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-D from docs/architecture/ALPHA-GAP-RUST-PERSONA-RUNTIME.md: capability-shaped model resolution with no-fallback contract. Builds on the typed model_registry SSOT (models.toml + providers.toml + Arch/Capability vocab) and the TargetSilicon 2-axis from #1062 (and dropped_no_budget loud-fail from #1063). cognition/model_resolver.rs (pure module — no IPC, no ORM, no inference): - ModelRequirement: required_capabilities, arch_preference, context_window_min, memory_budget_mb, provider_policy, host - ResolvedModel: model_id, provider_id, expected_memory_mb, target_silicon, hw_capability_tier, reason - HwCapabilityTier: finer-grained than TargetSilicon (M1Uma8Gb..M3UmaProMax, Sm70..Sm120, VulkanAmd, Cloud) - LocalOrCloudPolicy: LocalOnly | CloudOnly | PreferLocal | PreferCloud | Any - HostCapability: per-machine snapshot (tier + memory + primary silicon) - ResolutionError: NoModelMatchesRequirement{registry_count, candidates_after_filter, unmet_filters} — typed, no fallback - resolve_model(): pure function over IntoIterator<&Model> target_silicon derivation: local providers (llamacpp-local, docker-model-runner) inherit host.primary_target_silicon; cloud providers always TargetSilicon::Cloud. Hardcoded local-provider list for v1; follow-up moves it to a kind: local|cloud field on Provider in providers.toml. expected_memory_mb stays None until Model schema gains an estimated_memory_mb field — separate followup. Today's resolver still rejects cloud models from LocalOnly queries, which prevents the worst class of mis-routing. model_registry/types.rs: Arch gains #[derive(TS)] + ts(export) parallel to the existing Capability derivation. 
Backwards-compatible additive change; required because ModelRequirement.arch_preference: Vec<Arch> crosses the TS boundary. 10 logic tests + 6 ts-rs export-binding tests = 16/16 green: - local_chat_resolves_to_qwen35_on_m1 - vision_request_resolves_to_qwen2_vl - cloud_only_skips_local_models - missing_capability_errors_no_fallback (NO FALLBACK assertion) - vision_with_local_only_on_cpu_host_still_finds_local_vision_model - context_window_min_filters_small_models - arch_preference_filters_to_qwen35_only - prefer_local_ranks_local_first - prefer_cloud_ranks_cloud_first - five_persona_resolution_smoke (Lane C contract test) Validation: - cargo test --features metal,accelerate cognition::model_resolver: 16/16 - npx tsx scripts/build-with-loud-failure.ts: TypeScript compilation succeeded Two SSOTs noted (TOML registry vs shared/models.json) — out of Lane C scope, filed for separate consolidation followup. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../generated/cognition/HostCapability.ts | 23 + .../generated/cognition/HwCapabilityTier.ts | 15 + .../generated/cognition/LocalOrCloudPolicy.ts | 6 + .../generated/cognition/ModelRequirement.ts | 40 + .../generated/cognition/ResolutionError.ts | 12 + .../generated/cognition/ResolvedModel.ts | 26 + src/shared/generated/cognition/index.ts | 15 + src/shared/generated/model_registry/Arch.ts | 12 + src/shared/generated/model_registry/index.ts | 1 + .../continuum-core/src/cognition/mod.rs | 2 + .../src/cognition/model_resolver.rs | 718 ++++++++++++++++++ .../src/model_registry/types.rs | 8 +- 12 files changed, 877 insertions(+), 1 deletion(-) create mode 100644 src/shared/generated/cognition/HostCapability.ts create mode 100644 src/shared/generated/cognition/HwCapabilityTier.ts create mode 100644 src/shared/generated/cognition/LocalOrCloudPolicy.ts create mode 100644 src/shared/generated/cognition/ModelRequirement.ts create mode 100644 src/shared/generated/cognition/ResolutionError.ts create mode 100644 
src/shared/generated/cognition/ResolvedModel.ts create mode 100644 src/shared/generated/model_registry/Arch.ts create mode 100644 src/workers/continuum-core/src/cognition/model_resolver.rs diff --git a/src/shared/generated/cognition/HostCapability.ts b/src/shared/generated/cognition/HostCapability.ts new file mode 100644 index 000000000..6cdf6a163 --- /dev/null +++ b/src/shared/generated/cognition/HostCapability.ts @@ -0,0 +1,23 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { HwCapabilityTier } from "./HwCapabilityTier"; +import type { TargetSilicon } from "./TargetSilicon"; + +/** + * What the resolver knows about THIS machine. Caller populates from a + * hardware-detection probe at boot (see future `device_probe` module). + * The resolver consumes this as a snapshot — re-invoke when probe values + * change. + */ +export type HostCapability = { hwCapabilityTier: HwCapabilityTier, +/** + * Memory available for inference workloads in megabytes. For unified- + * memory hosts this is the share inference is willing to claim, not + * total system RAM. + */ +availableMemoryMb: number, +/** + * Which physical-budget pool inference workloads on this host should + * admit against. Mac M-series → `UnifiedMemory`; nVidia → `Gpu`; + * CPU-only → `Cpu`. + */ +primaryTargetSilicon: TargetSilicon, }; diff --git a/src/shared/generated/cognition/HwCapabilityTier.ts b/src/shared/generated/cognition/HwCapabilityTier.ts new file mode 100644 index 000000000..2f239ec18 --- /dev/null +++ b/src/shared/generated/cognition/HwCapabilityTier.ts @@ -0,0 +1,15 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * Finer-grained hardware tier than [`TargetSilicon`]. Selects which model + * VARIANT a host can run, not which physical-budget POOL admission uses. 
+ * + * Example: `M1Uma8Gb` and `M3UmaProMax` both have + * `target_silicon == TargetSilicon::UnifiedMemory`, but only the latter + * can hold a 4B-parameter model alongside a 7B vision model. + * + * Lane B's lease layer + adaptive_throughput's budgets care about the + * pool (TargetSilicon). Lane C's resolver cares about the variant + * (HwCapabilityTier). + */ +export type HwCapabilityTier = "cpu_only" | "m1_uma8_gb" | "m1_uma16_gb" | "m2_uma_pro_max" | "m3_uma_pro_max" | "sm70" | "sm80" | "sm86" | "sm89" | "sm90" | "sm120" | "vulkan_amd" | "cloud"; diff --git a/src/shared/generated/cognition/LocalOrCloudPolicy.ts b/src/shared/generated/cognition/LocalOrCloudPolicy.ts new file mode 100644 index 000000000..5e643cc06 --- /dev/null +++ b/src/shared/generated/cognition/LocalOrCloudPolicy.ts @@ -0,0 +1,6 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * How aggressively to prefer local vs cloud providers. + */ +export type LocalOrCloudPolicy = "local_only" | "cloud_only" | "prefer_local" | "prefer_cloud" | "any"; diff --git a/src/shared/generated/cognition/ModelRequirement.ts b/src/shared/generated/cognition/ModelRequirement.ts new file mode 100644 index 000000000..95c4e8de9 --- /dev/null +++ b/src/shared/generated/cognition/ModelRequirement.ts @@ -0,0 +1,40 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { Arch } from "../model_registry/Arch"; +import type { Capability } from "../model_registry/Capability"; +import type { HostCapability } from "./HostCapability"; +import type { LocalOrCloudPolicy } from "./LocalOrCloudPolicy"; + +/** + * Capability-shaped query for the resolver. Callers describe what the + * model needs to DO (generate text, see images, etc.) — not which model + * to use. Per Joel's axiom: code knows ARCHETYPES, models are data. 
+ */ +export type ModelRequirement = { +/** + * Capabilities every candidate must advertise. Empty set matches any + * model (rare — usually callers want at least `Chat`). + */ +requiredCapabilities: Array, +/** + * Architectural family preference. Empty = any architecture qualifies. + * When non-empty, candidates outside the preference are filtered out + * rather than down-ranked — caller wants this family or none. + */ +archPreference: Array, +/** + * Minimum context window in tokens. `0` = any. + */ +contextWindowMin: number, +/** + * Maximum memory the resolved model may consume on this host, in MB. + * `None` = use `host.available_memory_mb` as the implicit cap. + */ +memoryBudgetMb?: number, +/** + * Local-vs-cloud preference. See [`LocalOrCloudPolicy`]. + */ +providerPolicy: LocalOrCloudPolicy, +/** + * Host capability snapshot. See [`HostCapability`]. + */ +host: HostCapability, }; diff --git a/src/shared/generated/cognition/ResolutionError.ts b/src/shared/generated/cognition/ResolutionError.ts new file mode 100644 index 000000000..23cfbf2e1 --- /dev/null +++ b/src/shared/generated/cognition/ResolutionError.ts @@ -0,0 +1,12 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * Why a [`resolve_model`] call failed. Each variant names the SPECIFIC + * filter that eliminated all candidates so the caller's error message + * can be actionable. + * + * No `Fallback` variant. Per Joel's rule: missing-model is an error, not + * a soft retry on a default. Callers that want graceful degradation must + * EXPLICITLY relax their requirement and re-invoke. 
+ */ +export type ResolutionError = { "kind": "noModelMatchesRequirement", registry_count: number, candidates_after_filter: number, unmet_filters: Array, }; diff --git a/src/shared/generated/cognition/ResolvedModel.ts b/src/shared/generated/cognition/ResolvedModel.ts new file mode 100644 index 000000000..abc3635b6 --- /dev/null +++ b/src/shared/generated/cognition/ResolvedModel.ts @@ -0,0 +1,26 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. +import type { HwCapabilityTier } from "./HwCapabilityTier"; +import type { TargetSilicon } from "./TargetSilicon"; + +/** + * Resolver output. Includes the silicon target so the caller can plumb it + * straight into a [`ThroughputJob`] without re-deriving it from the + * model + host. + */ +export type ResolvedModel = { modelId: string, providerId: string, +/** + * Expected memory footprint in megabytes if the registry knows it. + * `None` for cloud models (always-fits) and for local models whose + * row in `models.toml` doesn't yet declare a memory estimate. A + * follow-up adds an `estimated_memory_mb` field to the Model schema; + * until then memory-budget filtering is best-effort on local models + * (the resolver still rejects cloud models from `LocalOnly` queries). + */ +expectedMemoryMb?: number, targetSilicon: TargetSilicon, hwCapabilityTier: HwCapabilityTier, +/** + * Human-readable explanation of why this model was chosen. Surfaced + * in logs + UI when a persona's resolution changes (e.g., "switched + * from gpt-4o to claude-sonnet-4-5 because PreferLocal couldn't + * satisfy required Capability::Vision on this host"). 
+ */ +reason: string, }; diff --git a/src/shared/generated/cognition/index.ts b/src/shared/generated/cognition/index.ts index 2bb2b8802..0b7a2861f 100644 --- a/src/shared/generated/cognition/index.ts +++ b/src/shared/generated/cognition/index.ts @@ -2,9 +2,15 @@ // Source: generator/generate-rust-bindings.ts // Re-generate: npx tsx generator/generate-rust-bindings.ts +export type { AdaptiveThroughputPlan } from './AdaptiveThroughputPlan'; +export type { AdaptiveThroughputRequest } from './AdaptiveThroughputRequest'; +export type { HostCapability } from './HostCapability'; +export type { HwCapabilityTier } from './HwCapabilityTier'; export type { LeverCall } from './LeverCall'; export type { LeverName } from './LeverName'; +export type { LocalOrCloudPolicy } from './LocalOrCloudPolicy'; export type { MediaItemLite } from './MediaItemLite'; +export type { ModelRequirement } from './ModelRequirement'; export type { NativeBatchOutcome } from './NativeBatchOutcome'; export type { ParsedToolBatch } from './ParsedToolBatch'; export type { PersonaMediaConfigLite } from './PersonaMediaConfigLite'; @@ -18,10 +24,19 @@ export type { RecipeRagSourcePolicy } from './RecipeRagSourcePolicy'; export type { RecipeTurnBatchPlan } from './RecipeTurnBatchPlan'; export type { RecipeTurnBatchRequest } from './RecipeTurnBatchRequest'; export type { RecipeTurnTrigger } from './RecipeTurnTrigger'; +export type { ResolutionError } from './ResolutionError'; +export type { ResolvedModel } from './ResolvedModel'; +export type { ResourceClass } from './ResourceClass'; export type { ResponderDecision } from './ResponderDecision'; export type { SharedAnalysis } from './SharedAnalysis'; export type { SharedAnalysisIntent } from './SharedAnalysisIntent'; export type { SharedRagSourcePlan } from './SharedRagSourcePlan'; +export type { TargetSilicon } from './TargetSilicon'; +export type { ThroughputJob } from './ThroughputJob'; +export type { ThroughputLaneBudget } from './ThroughputLaneBudget'; 
+export type { ThroughputLease } from './ThroughputLease'; +export type { ThroughputLeaseRevocationPolicy } from './ThroughputLeaseRevocationPolicy'; +export type { ThroughputLeaseSnapshot } from './ThroughputLeaseSnapshot'; export type { ToolExecutionContext } from './ToolExecutionContext'; export type { ToolInvocation } from './ToolInvocation'; export type { ToolOutcome } from './ToolOutcome'; diff --git a/src/shared/generated/model_registry/Arch.ts b/src/shared/generated/model_registry/Arch.ts new file mode 100644 index 000000000..1a5a81282 --- /dev/null +++ b/src/shared/generated/model_registry/Arch.ts @@ -0,0 +1,12 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * Model architecture family. Typed (not stringly-typed) so call sites + * use enum matching, not string comparison. Adding a new arch means: + * (a) add the variant here, (b) add a TOML row with `arch = "new_arch"`. + * Code that dispatches by arch gets a compile error reminding the author + * to handle the new variant — precisely the pattern Joel's axiom calls + * for ("code should NEVER know the model" — code knows the ARCHETYPES + * via this enum, models are data). 
+ */ +export type Arch = "qwen2" | "qwen3" | "qwen35" | "llama" | "claude" | "gpt" | "gemini" | "grok" | "deepseek" | "unknown"; diff --git a/src/shared/generated/model_registry/index.ts b/src/shared/generated/model_registry/index.ts index 700da966a..afd28d110 100644 --- a/src/shared/generated/model_registry/index.ts +++ b/src/shared/generated/model_registry/index.ts @@ -2,4 +2,5 @@ // Source: generator/generate-rust-bindings.ts // Re-generate: npx tsx generator/generate-rust-bindings.ts +export type { Arch } from './Arch'; export type { Capability } from './Capability'; diff --git a/src/workers/continuum-core/src/cognition/mod.rs b/src/workers/continuum-core/src/cognition/mod.rs index 08358c12e..93156f21c 100644 --- a/src/workers/continuum-core/src/cognition/mod.rs +++ b/src/workers/continuum-core/src/cognition/mod.rs @@ -28,6 +28,7 @@ //! `ResponderDecision`) pub mod adaptive_throughput; +pub mod model_resolver; pub mod response_orchestrator; pub mod response_validator; pub mod shared_analysis; @@ -37,6 +38,7 @@ pub mod turn_batch; pub mod types; pub use adaptive_throughput::*; +pub use model_resolver::*; pub use response_orchestrator::{ DEFAULT_RELEVANCE_THRESHOLD, PersonaSlot, orchestrate, score_persona, }; diff --git a/src/workers/continuum-core/src/cognition/model_resolver.rs b/src/workers/continuum-core/src/cognition/model_resolver.rs new file mode 100644 index 000000000..de754e247 --- /dev/null +++ b/src/workers/continuum-core/src/cognition/model_resolver.rs @@ -0,0 +1,718 @@ +//! Model resolver — capability-shaped model selection. +//! +//! Pure contract for "given a ModelRequirement, which concrete model_id +//! satisfies it on this host?" Does not load models, initialize backends, +//! or call providers. Does not invent fallbacks: a requirement that cannot +//! be satisfied returns a typed [`ResolutionError`], not a best-guess model. +//! +//! Per Joel's rule (`fallbacks are illegal`): callers handle the error +//! explicitly. 
There is no fall-through to a base model — that turns silent +//! capability mismatches into runtime failures downstream. +//! +//! The resolver is the lookup half of the Adaptive Throughput Substrate. +//! `adaptive_throughput` plans LANES; this module picks WHICH MODEL fills +//! a given lane's request. The two share [`TargetSilicon`] as the join +//! key — `ResolvedModel.target_silicon` flows into +//! `ThroughputJob.target_silicon` when the resolver's output is admitted. +//! +//! Symmetrical to `adaptive_throughput.rs`: pure planner, callers re-invoke +//! when host capabilities change (e.g., another model evicted, GPU +//! pressure shifted). +//! +//! Source-of-truth ordering for model data: this module reads Models from +//! the typed registry (`crate::model_registry`). It does NOT itself read +//! `models.toml` or `models.json` — the registry already loaded both. + +use crate::cognition::adaptive_throughput::TargetSilicon; +use crate::model_registry::types::{Arch, Capability, Model}; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeSet; +use ts_rs::TS; + +/// Finer-grained hardware tier than [`TargetSilicon`]. Selects which model +/// VARIANT a host can run, not which physical-budget POOL admission uses. +/// +/// Example: `M1Uma8Gb` and `M3UmaProMax` both have +/// `target_silicon == TargetSilicon::UnifiedMemory`, but only the latter +/// can hold a 4B-parameter model alongside a 7B vision model. +/// +/// Lane B's lease layer + adaptive_throughput's budgets care about the +/// pool (TargetSilicon). Lane C's resolver cares about the variant +/// (HwCapabilityTier). +#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, TS)] +#[serde(rename_all = "snake_case")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/HwCapabilityTier.ts" +)] +pub enum HwCapabilityTier { + /// No GPU, no NPU. Inference happens on CPU only. + CpuOnly, + /// Apple M1, 8GB unified memory. MBA-tier baseline. 
+ M1Uma8Gb, + /// Apple M1/M2, 16GB unified memory. + M1Uma16Gb, + /// Apple M2/M3 Pro/Max, 32GB+ unified memory. + M2UmaProMax, + /// Apple M3 Pro/Max/Ultra, 32GB+ unified memory. + M3UmaProMax, + /// nVidia compute capability 7.0 (V100). + Sm70, + /// nVidia compute capability 8.0 (A100). + Sm80, + /// nVidia compute capability 8.6 (RTX 30xx, A40). + Sm86, + /// nVidia compute capability 8.9 (RTX 40xx). + Sm89, + /// nVidia compute capability 9.0 (H100). + Sm90, + /// nVidia compute capability 12.0 (RTX 50xx). + Sm120, + /// AMD GPU via Vulkan backend. + VulkanAmd, + /// Remote inference — host capability irrelevant. + Cloud, +} + +/// How aggressively to prefer local vs cloud providers. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize, TS)] +#[serde(rename_all = "snake_case")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/LocalOrCloudPolicy.ts" +)] +pub enum LocalOrCloudPolicy { + /// Match local providers only. Cloud models are filtered out. + LocalOnly, + /// Match cloud providers only. Local models are filtered out. + CloudOnly, + /// Both eligible; rank local higher in the result. + PreferLocal, + /// Both eligible; rank cloud higher in the result. + PreferCloud, + /// Both eligible; no ranking preference. + Any, +} + +/// What the resolver knows about THIS machine. Caller populates from a +/// hardware-detection probe at boot (see future `device_probe` module). +/// The resolver consumes this as a snapshot — re-invoke when probe values +/// change. +#[derive(Debug, Clone, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/HostCapability.ts" +)] +pub struct HostCapability { + pub hw_capability_tier: HwCapabilityTier, + /// Memory available for inference workloads in megabytes. For unified- + /// memory hosts this is the share inference is willing to claim, not + /// total system RAM. 
+ pub available_memory_mb: u32, + /// Which physical-budget pool inference workloads on this host should + /// admit against. Mac M-series → `UnifiedMemory`; nVidia → `Gpu`; + /// CPU-only → `Cpu`. + pub primary_target_silicon: TargetSilicon, +} + +/// Capability-shaped query for the resolver. Callers describe what the +/// model needs to DO (generate text, see images, etc.) — not which model +/// to use. Per Joel's axiom: code knows ARCHETYPES, models are data. +#[derive(Debug, Clone, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/ModelRequirement.ts" +)] +pub struct ModelRequirement { + /// Capabilities every candidate must advertise. Empty set matches any + /// model (rare — usually callers want at least `Chat`). + pub required_capabilities: BTreeSet, + /// Architectural family preference. Empty = any architecture qualifies. + /// When non-empty, candidates outside the preference are filtered out + /// rather than down-ranked — caller wants this family or none. + #[serde(default)] + pub arch_preference: Vec, + /// Minimum context window in tokens. `0` = any. + #[serde(default)] + pub context_window_min: u32, + /// Maximum memory the resolved model may consume on this host, in MB. + /// `None` = use `host.available_memory_mb` as the implicit cap. + #[ts(optional)] + pub memory_budget_mb: Option, + /// Local-vs-cloud preference. See [`LocalOrCloudPolicy`]. + pub provider_policy: LocalOrCloudPolicy, + /// Host capability snapshot. See [`HostCapability`]. + pub host: HostCapability, +} + +/// Resolver output. Includes the silicon target so the caller can plumb it +/// straight into a [`ThroughputJob`] without re-deriving it from the +/// model + host. 
+#[derive(Debug, Clone, Serialize, Deserialize, TS)] +#[serde(rename_all = "camelCase")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/ResolvedModel.ts" +)] +pub struct ResolvedModel { + pub model_id: String, + pub provider_id: String, + /// Expected memory footprint in megabytes if the registry knows it. + /// `None` for cloud models (always-fits) and for local models whose + /// row in `models.toml` doesn't yet declare a memory estimate. A + /// follow-up adds an `estimated_memory_mb` field to the Model schema; + /// until then memory-budget filtering is best-effort on local models + /// (the resolver still rejects cloud models from `LocalOnly` queries). + #[ts(optional)] + pub expected_memory_mb: Option, + pub target_silicon: TargetSilicon, + pub hw_capability_tier: HwCapabilityTier, + /// Human-readable explanation of why this model was chosen. Surfaced + /// in logs + UI when a persona's resolution changes (e.g., "switched + /// from gpt-4o to claude-sonnet-4-5 because PreferLocal couldn't + /// satisfy required Capability::Vision on this host"). + pub reason: String, +} + +/// Why a [`resolve_model`] call failed. Each variant names the SPECIFIC +/// filter that eliminated all candidates so the caller's error message +/// can be actionable. +/// +/// No `Fallback` variant. Per Joel's rule: missing-model is an error, not +/// a soft retry on a default. Callers that want graceful degradation must +/// EXPLICITLY relax their requirement and re-invoke. +#[derive(Debug, Clone, Serialize, Deserialize, TS, thiserror::Error)] +#[serde(rename_all = "camelCase", tag = "kind")] +#[ts( + export, + export_to = "../../../shared/generated/cognition/ResolutionError.ts" +)] +pub enum ResolutionError { + #[error( + "no model satisfies requirement: {registry_count} models in registry, \ + {candidates_after_filter} survived filtering. 
unmet: {unmet_filters:?}" + )] + NoModelMatchesRequirement { + registry_count: usize, + candidates_after_filter: usize, + unmet_filters: Vec, + }, +} + +/// Provider ids treated as local. Hardcoded for v1; a follow-up moves this +/// to a `kind: local|cloud` field on `Provider` in `providers.toml`. +const LOCAL_PROVIDER_IDS: &[&str] = &["llamacpp-local", "docker-model-runner"]; + +fn is_local_provider(provider_id: &str) -> bool { + LOCAL_PROVIDER_IDS.contains(&provider_id) +} + +fn derive_target_silicon(model: &Model, host: &HostCapability) -> TargetSilicon { + if is_local_provider(&model.provider) { + host.primary_target_silicon + } else { + TargetSilicon::Cloud + } +} + +/// Resolve a [`ModelRequirement`] against a model catalog. Pure: caller +/// supplies the iterator of [`Model`] (typically `registry.models()`). +/// +/// Filter order (each step records the unmet predicate when it eliminates +/// the last candidate, so the error names the specific cause): +/// 1. `required_capabilities` — every cap must be advertised +/// 2. `arch_preference` — when non-empty, must match +/// 3. `context_window_min` — model's window ≥ requirement +/// 4. `provider_policy` — Local/Cloud filter +/// 5. memory budget — local models with declared estimates only +/// +/// Returns the first survivor under the policy's ranking. `PreferLocal` +/// puts local providers first; `PreferCloud` puts cloud providers first; +/// other policies preserve registry order. +pub fn resolve_model<'a, I>( + requirement: &ModelRequirement, + models: I, +) -> Result +where + I: IntoIterator, +{ + let registry: Vec<&Model> = models.into_iter().collect(); + let registry_count = registry.len(); + let mut unmet: Vec = Vec::new(); + + // Filter 1: required capabilities. 
+ let mut candidates: Vec<&Model> = registry + .iter() + .copied() + .filter(|m| { + requirement + .required_capabilities + .iter() + .all(|c| m.has(*c)) + }) + .collect(); + if candidates.is_empty() && !requirement.required_capabilities.is_empty() { + unmet.push(format!( + "required_capabilities={:?}", + requirement.required_capabilities + )); + return Err(ResolutionError::NoModelMatchesRequirement { + registry_count, + candidates_after_filter: 0, + unmet_filters: unmet, + }); + } + + // Filter 2: arch preference. + if !requirement.arch_preference.is_empty() { + let after_arch: Vec<&Model> = candidates + .iter() + .copied() + .filter(|m| requirement.arch_preference.contains(&m.arch)) + .collect(); + if after_arch.is_empty() { + unmet.push(format!( + "arch_preference={:?} (no survivor matched)", + requirement.arch_preference + )); + return Err(ResolutionError::NoModelMatchesRequirement { + registry_count, + candidates_after_filter: 0, + unmet_filters: unmet, + }); + } + candidates = after_arch; + } + + // Filter 3: context window minimum. + if requirement.context_window_min > 0 { + let before = candidates.len(); + candidates.retain(|m| m.context_window >= requirement.context_window_min); + if candidates.is_empty() { + unmet.push(format!( + "context_window_min={} (eliminated {} candidates)", + requirement.context_window_min, before + )); + return Err(ResolutionError::NoModelMatchesRequirement { + registry_count, + candidates_after_filter: 0, + unmet_filters: unmet, + }); + } + } + + // Filter 4: provider policy. 
+ let before_provider = candidates.len(); + candidates.retain(|m| match requirement.provider_policy { + LocalOrCloudPolicy::LocalOnly => is_local_provider(&m.provider), + LocalOrCloudPolicy::CloudOnly => !is_local_provider(&m.provider), + LocalOrCloudPolicy::PreferLocal + | LocalOrCloudPolicy::PreferCloud + | LocalOrCloudPolicy::Any => true, + }); + if candidates.is_empty() { + unmet.push(format!( + "provider_policy={:?} (eliminated {} candidates)", + requirement.provider_policy, before_provider + )); + return Err(ResolutionError::NoModelMatchesRequirement { + registry_count, + candidates_after_filter: 0, + unmet_filters: unmet, + }); + } + + // Rank: PreferLocal/PreferCloud reorder; other policies preserve order. + match requirement.provider_policy { + LocalOrCloudPolicy::PreferLocal => { + candidates.sort_by_key(|m| u8::from(!is_local_provider(&m.provider))); + } + LocalOrCloudPolicy::PreferCloud => { + candidates.sort_by_key(|m| u8::from(is_local_provider(&m.provider))); + } + _ => {} + } + + let best = candidates.first().expect("non-empty after filters"); + let target_silicon = derive_target_silicon(best, &requirement.host); + let reason = format!( + "matched {} required capability(ies) on arch={:?}, context={}, provider={}, policy={:?}", + requirement.required_capabilities.len(), + best.arch, + best.context_window, + best.provider, + requirement.provider_policy, + ); + + Ok(ResolvedModel { + model_id: best.id.clone(), + provider_id: best.provider.clone(), + // expected_memory_mb stays None until the Model schema gains an + // `estimated_memory_mb` field. Not blocking for v1; the + // LocalOnly/CloudOnly filter already prevents the worst class of + // mis-routing (running a 7B model on the cloud lane). 
+ expected_memory_mb: None, + target_silicon, + hw_capability_tier: requirement.host.hw_capability_tier, + reason, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::model_registry::types::MultiPartyChatStrategy; + + fn make_model( + id: &str, + provider: &str, + arch: Arch, + context_window: u32, + caps: &[Capability], + ) -> Model { + Model { + id: id.into(), + name: None, + provider: provider.into(), + arch, + context_window, + max_output_tokens: 4096, + tokens_per_second: 50.0, + capabilities: caps.iter().copied().collect(), + cost_input_per_1k: 0.0, + cost_output_per_1k: 0.0, + gguf_hint: None, + gguf_local_path: None, + mmproj_local_path: None, + chat_template: None, + multi_party_strategy: MultiPartyChatStrategy::default(), + stop_sequences: vec![], + } + } + + fn host_m1_8gb() -> HostCapability { + HostCapability { + hw_capability_tier: HwCapabilityTier::M1Uma8Gb, + available_memory_mb: 6144, + primary_target_silicon: TargetSilicon::UnifiedMemory, + } + } + + fn host_rtx5090() -> HostCapability { + HostCapability { + hw_capability_tier: HwCapabilityTier::Sm120, + available_memory_mb: 32768, + primary_target_silicon: TargetSilicon::Gpu, + } + } + + fn host_cpu_only() -> HostCapability { + HostCapability { + hw_capability_tier: HwCapabilityTier::CpuOnly, + available_memory_mb: 8192, + primary_target_silicon: TargetSilicon::Cpu, + } + } + + fn registry() -> Vec { + vec![ + make_model( + "claude-sonnet-4-5-20250929", + "anthropic", + Arch::Claude, + 200_000, + &[ + Capability::TextGeneration, + Capability::Chat, + Capability::ToolUse, + Capability::Vision, + Capability::Streaming, + ], + ), + make_model( + "gpt-4o", + "openai", + Arch::Gpt, + 128_000, + &[ + Capability::TextGeneration, + Capability::Chat, + Capability::Vision, + Capability::AudioInput, + Capability::AudioOutput, + ], + ), + make_model( + "continuum-ai/qwen3.5-4b-code-forged-GGUF", + "llamacpp-local", + Arch::Qwen35, + 262_144, + &[ + Capability::TextGeneration, + 
Capability::Chat, + Capability::ToolUse, + ], + ), + make_model( + "qwen2-vl-7b-instruct", + "llamacpp-local", + Arch::Qwen2, + 32_768, + &[ + Capability::TextGeneration, + Capability::Chat, + Capability::Vision, + ], + ), + make_model( + "qwen2-0.5b-gating", + "llamacpp-local", + Arch::Qwen2, + 8_192, + &[Capability::TextGeneration, Capability::Chat], + ), + ] + } + + fn req_chat_local(host: HostCapability) -> ModelRequirement { + ModelRequirement { + required_capabilities: [Capability::Chat].iter().copied().collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::LocalOnly, + host, + } + } + + fn req_vision_local(host: HostCapability) -> ModelRequirement { + ModelRequirement { + required_capabilities: [Capability::Chat, Capability::Vision] + .iter() + .copied() + .collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::LocalOnly, + host, + } + } + + #[test] + fn local_chat_resolves_to_qwen35_on_m1() { + let r = registry(); + let resolved = resolve_model(&req_chat_local(host_m1_8gb()), r.iter()).unwrap(); + assert_eq!(resolved.provider_id, "llamacpp-local"); + assert!( + resolved.model_id.starts_with("continuum-ai/qwen3.5") || resolved.model_id.starts_with("qwen2"), + "expected a local qwen model, got {}", + resolved.model_id, + ); + assert_eq!(resolved.target_silicon, TargetSilicon::UnifiedMemory); + assert_eq!(resolved.hw_capability_tier, HwCapabilityTier::M1Uma8Gb); + } + + #[test] + fn vision_request_resolves_to_qwen2_vl() { + let r = registry(); + let resolved = resolve_model(&req_vision_local(host_rtx5090()), r.iter()).unwrap(); + assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); + assert_eq!(resolved.provider_id, "llamacpp-local"); + assert_eq!(resolved.target_silicon, TargetSilicon::Gpu); + assert_eq!(resolved.hw_capability_tier, HwCapabilityTier::Sm120); + } + + #[test] + fn cloud_only_skips_local_models() { + 
let r = registry(); + let mut req = req_chat_local(host_rtx5090()); + req.provider_policy = LocalOrCloudPolicy::CloudOnly; + let resolved = resolve_model(&req, r.iter()).unwrap(); + assert!( + ["anthropic", "openai"].contains(&resolved.provider_id.as_str()), + "expected cloud provider, got {}", + resolved.provider_id, + ); + assert_eq!(resolved.target_silicon, TargetSilicon::Cloud); + } + + #[test] + fn missing_capability_errors_no_fallback() { + let r = registry(); + let req = ModelRequirement { + required_capabilities: [Capability::ImageGeneration].iter().copied().collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::Any, + host: host_rtx5090(), + }; + let err = resolve_model(&req, r.iter()).unwrap_err(); + let ResolutionError::NoModelMatchesRequirement { + registry_count, + candidates_after_filter, + unmet_filters, + } = err; + assert_eq!(registry_count, r.len()); + assert_eq!(candidates_after_filter, 0); + assert!( + unmet_filters.iter().any(|f| f.contains("ImageGeneration")), + "unmet filters should name ImageGeneration: {unmet_filters:?}" + ); + } + + #[test] + fn vision_with_local_only_on_cpu_host_still_finds_local_vision_model() { + // Even on a CPU-only host, the resolver should return the local + // vision model — admission/feasibility is the substrate's job + // (adaptive_throughput will refuse the lane if the host can't + // run it). The resolver answers "what fits the requirement," + // not "what will succeed at inference time." 
+ let r = registry(); + let resolved = resolve_model(&req_vision_local(host_cpu_only()), r.iter()).unwrap(); + assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); + assert_eq!(resolved.target_silicon, TargetSilicon::Cpu); + assert_eq!(resolved.hw_capability_tier, HwCapabilityTier::CpuOnly); + } + + #[test] + fn context_window_min_filters_small_models() { + let r = registry(); + let req = ModelRequirement { + required_capabilities: [Capability::Chat].iter().copied().collect(), + arch_preference: vec![], + context_window_min: 100_000, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::LocalOnly, + host: host_rtx5090(), + }; + let resolved = resolve_model(&req, r.iter()).unwrap(); + // Only qwen3.5-4b (262144 ctx) survives among local with ≥100k window. + assert_eq!(resolved.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + } + + #[test] + fn arch_preference_filters_to_qwen35_only() { + let r = registry(); + let req = ModelRequirement { + required_capabilities: [Capability::Chat].iter().copied().collect(), + arch_preference: vec![Arch::Qwen35], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::Any, + host: host_rtx5090(), + }; + let resolved = resolve_model(&req, r.iter()).unwrap(); + assert_eq!(resolved.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + } + + #[test] + fn prefer_local_ranks_local_first() { + let r = registry(); + let req = ModelRequirement { + required_capabilities: [Capability::Chat, Capability::Vision] + .iter() + .copied() + .collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::PreferLocal, + host: host_rtx5090(), + }; + let resolved = resolve_model(&req, r.iter()).unwrap(); + assert_eq!(resolved.provider_id, "llamacpp-local"); + assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); + } + + #[test] + fn prefer_cloud_ranks_cloud_first() { + let r = registry(); + let req = ModelRequirement { + 
required_capabilities: [Capability::Chat, Capability::Vision] + .iter() + .copied() + .collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::PreferCloud, + host: host_rtx5090(), + }; + let resolved = resolve_model(&req, r.iter()).unwrap(); + assert!( + ["anthropic", "openai"].contains(&resolved.provider_id.as_str()), + "expected cloud first, got {}", + resolved.provider_id, + ); + } + + #[test] + fn five_persona_resolution_smoke() { + // Lane C contract test: 5 personas with different needs all + // resolve to the correct concrete model + missing path errors. + let r = registry(); + + // Persona 1: Helper AI — local chat. + let helper = resolve_model(&req_chat_local(host_m1_8gb()), r.iter()).unwrap(); + assert_eq!(helper.provider_id, "llamacpp-local"); + + // Persona 2: Vision AI — local vision. + let vision = resolve_model(&req_vision_local(host_m1_8gb()), r.iter()).unwrap(); + assert_eq!(vision.model_id, "qwen2-vl-7b-instruct"); + + // Persona 3: Cloud-only persona — wants vision via cloud. + let mut cloud_vision_req = req_vision_local(host_m1_8gb()); + cloud_vision_req.provider_policy = LocalOrCloudPolicy::CloudOnly; + let cloud_vision = resolve_model(&cloud_vision_req, r.iter()).unwrap(); + assert!( + ["anthropic", "openai"].contains(&cloud_vision.provider_id.as_str()), + "expected cloud, got {}", + cloud_vision.provider_id, + ); + + // Persona 4: Audio-input persona under policy `Any`. No local model in the + // registry advertises audio-input, so this must resolve to cloud gpt-4o. + let mut audio_req = req_chat_local(host_rtx5090()); + audio_req.required_capabilities = [Capability::Chat, Capability::AudioInput] + .iter() + .copied() + .collect(); + audio_req.provider_policy = LocalOrCloudPolicy::Any; + let audio = resolve_model(&audio_req, r.iter()).unwrap(); + assert_eq!(audio.model_id, "gpt-4o"); + + // Persona 5: Code persona requiring tool-use — qwen3.5 OR claude.
+ let mut code_req = req_chat_local(host_rtx5090()); + code_req.required_capabilities = [Capability::Chat, Capability::ToolUse] + .iter() + .copied() + .collect(); + code_req.provider_policy = LocalOrCloudPolicy::PreferLocal; + let code = resolve_model(&code_req, r.iter()).unwrap(); + assert_eq!(code.provider_id, "llamacpp-local"); + assert_eq!(code.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + + // Missing-model error path: persona requires ImageGeneration which + // none of the registered models advertise. Must error, not fall + // back. + let img_req = ModelRequirement { + required_capabilities: [Capability::ImageGeneration].iter().copied().collect(), + arch_preference: vec![], + context_window_min: 0, + memory_budget_mb: None, + provider_policy: LocalOrCloudPolicy::Any, + host: host_rtx5090(), + }; + assert!( + matches!( + resolve_model(&img_req, r.iter()), + Err(ResolutionError::NoModelMatchesRequirement { .. }) + ), + "missing capability must error, not fall back" + ); + } +} diff --git a/src/workers/continuum-core/src/model_registry/types.rs b/src/workers/continuum-core/src/model_registry/types.rs index 42eb461b9..33aa1376c 100644 --- a/src/workers/continuum-core/src/model_registry/types.rs +++ b/src/workers/continuum-core/src/model_registry/types.rs @@ -16,7 +16,13 @@ use std::path::PathBuf; /// to handle the new variant — precisely the pattern Joel's axiom calls /// for ("code should NEVER know the model" — code knows the ARCHETYPES /// via this enum, models are data). 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, ts_rs::TS, +)] +#[ts( + export, + export_to = "../../../shared/generated/model_registry/Arch.ts" +)] #[serde(rename_all = "snake_case")] pub enum Arch { Qwen2, From d45788e24c29f60d141807e91730460044c43e20 Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 22:08:29 -0500 Subject: [PATCH 2/3] Doc: HwCapabilityTier closed-by-design + memory_budget_mb not-yet-enforced MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address sibling Mac review on PR #1066 — non-blocking doc-clarity flags: (α) HwCapabilityTier doc: spell out the closed-enum design choice. New hardware classes require enum-edit + ts-rs regen + an explicit alias decision. No Other(String) / wildcard fallback variant by design — silent routing to a default tier hides exactly the capacity-mismatch bugs the resolver exists to catch. Per Joel's no-fallback rule. (β) ModelRequirement.memory_budget_mb doc: explicitly state OBSERVED but NOT ENFORCED until Model schema gains estimated_memory_mb. Without this note, callers may pass it expecting filtering and silently get over- budget models. Loud-fail on memory pressure is a downstream Lane B (FootprintRegistry / PressureBroker) concern, not a resolver filter. ts-rs regenerated HwCapabilityTier.ts + ModelRequirement.ts with new docstrings. cargo test --features metal,accelerate cognition::model_resolver: 16/16 still green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../generated/cognition/HwCapabilityTier.ts | 10 ++++++++++ .../generated/cognition/ModelRequirement.ts | 10 ++++++++++ .../src/cognition/model_resolver.rs | 20 +++++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/src/shared/generated/cognition/HwCapabilityTier.ts b/src/shared/generated/cognition/HwCapabilityTier.ts index 2f239ec18..bdcd75b0f 100644 --- a/src/shared/generated/cognition/HwCapabilityTier.ts +++ b/src/shared/generated/cognition/HwCapabilityTier.ts @@ -11,5 +11,15 @@ * Lane B's lease layer + adaptive_throughput's budgets care about the * pool (TargetSilicon). Lane C's resolver cares about the variant * (HwCapabilityTier). + * + * **Closed enum by design.** New hardware classes (RTX 6090 → `Sm130`, + * M4, future Apple silicon) require an enum-edit + ts-rs regen + an + * explicit decision on which existing variant — if any — they alias to. + * There is intentionally no `Other(String)` or wildcard fallback variant: + * "unknown hardware" silently routing to a default tier hides + * capacity-mismatch bugs the resolver exists to catch. See Joel's rule + * on no fallbacks (`docs/architecture/...`). Adding a tier means the + * caller's hardware probe must produce it AND every match-on-tier site + * gets a compile error reminding the author to handle it. */ export type HwCapabilityTier = "cpu_only" | "m1_uma8_gb" | "m1_uma16_gb" | "m2_uma_pro_max" | "m3_uma_pro_max" | "sm70" | "sm80" | "sm86" | "sm89" | "sm90" | "sm120" | "vulkan_amd" | "cloud"; diff --git a/src/shared/generated/cognition/ModelRequirement.ts b/src/shared/generated/cognition/ModelRequirement.ts index 95c4e8de9..c547d256e 100644 --- a/src/shared/generated/cognition/ModelRequirement.ts +++ b/src/shared/generated/cognition/ModelRequirement.ts @@ -28,6 +28,16 @@ contextWindowMin: number, /** * Maximum memory the resolved model may consume on this host, in MB. * `None` = use `host.available_memory_mb` as the implicit cap. 
+ * + * **Currently OBSERVED but NOT ENFORCED.** Memory-budget filtering + * requires the [`Model`] schema to gain an `estimated_memory_mb` + * field — tracked as a separate followup. Until then, callers that + * pass this expecting filtering will silently get over-budget + * models. The `LocalOnly` / `CloudOnly` filter still prevents the + * worst class of mis-routing (running a 7B local model on the cloud + * lane). Loud-fail on memory pressure is a Lane B + * (FootprintRegistry / PressureBroker) concern downstream of + * resolution, not a resolver-side filter. */ memoryBudgetMb?: number, /** diff --git a/src/workers/continuum-core/src/cognition/model_resolver.rs b/src/workers/continuum-core/src/cognition/model_resolver.rs index de754e247..df9518fe1 100644 --- a/src/workers/continuum-core/src/cognition/model_resolver.rs +++ b/src/workers/continuum-core/src/cognition/model_resolver.rs @@ -39,6 +39,16 @@ use ts_rs::TS; /// Lane B's lease layer + adaptive_throughput's budgets care about the /// pool (TargetSilicon). Lane C's resolver cares about the variant /// (HwCapabilityTier). +/// +/// **Closed enum by design.** New hardware classes (RTX 6090 → `Sm130`, +/// M4, future Apple silicon) require an enum-edit + ts-rs regen + an +/// explicit decision on which existing variant — if any — they alias to. +/// There is intentionally no `Other(String)` or wildcard fallback variant: +/// "unknown hardware" silently routing to a default tier hides +/// capacity-mismatch bugs the resolver exists to catch. See Joel's rule +/// on no fallbacks (`docs/architecture/...`). Adding a tier means the +/// caller's hardware probe must produce it AND every match-on-tier site +/// gets a compile error reminding the author to handle it. 
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize, TS)] #[serde(rename_all = "snake_case")] #[ts( @@ -139,6 +149,16 @@ pub struct ModelRequirement { pub context_window_min: u32, /// Maximum memory the resolved model may consume on this host, in MB. /// `None` = use `host.available_memory_mb` as the implicit cap. + /// + /// **Currently OBSERVED but NOT ENFORCED.** Memory-budget filtering + /// requires the [`Model`] schema to gain an `estimated_memory_mb` + /// field — tracked as a separate followup. Until then, callers that + /// pass this expecting filtering will silently get over-budget + /// models. The `LocalOnly` / `CloudOnly` filter still prevents the + /// worst class of mis-routing (running a 7B local model on the cloud + /// lane). Loud-fail on memory pressure is a Lane B + /// (FootprintRegistry / PressureBroker) concern downstream of + /// resolution, not a resolver-side filter. #[ts(optional)] pub memory_budget_mb: Option, /// Local-vs-cloud preference. See [`LocalOrCloudPolicy`]. 
From fbdc357f8d703098052a8807da0a710351f7e57e Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 22:16:36 -0500 Subject: [PATCH 3/3] Make model resolver provider residency data-driven --- .../generated/cognition/HwCapabilityTier.ts | 2 +- .../generated/cognition/ModelRequirement.ts | 15 -- .../generated/model_registry/ProviderKind.ts | 10 + src/shared/generated/model_registry/index.ts | 1 + .../continuum-core/config/providers.toml | 2 + .../src/cognition/model_resolver.rs | 229 ++++++++++++------ .../src/model_registry/types.rs | 46 +++- 7 files changed, 208 insertions(+), 97 deletions(-) create mode 100644 src/shared/generated/model_registry/ProviderKind.ts diff --git a/src/shared/generated/cognition/HwCapabilityTier.ts b/src/shared/generated/cognition/HwCapabilityTier.ts index bdcd75b0f..e8ea51d22 100644 --- a/src/shared/generated/cognition/HwCapabilityTier.ts +++ b/src/shared/generated/cognition/HwCapabilityTier.ts @@ -22,4 +22,4 @@ * caller's hardware probe must produce it AND every match-on-tier site * gets a compile error reminding the author to handle it. */ -export type HwCapabilityTier = "cpu_only" | "m1_uma8_gb" | "m1_uma16_gb" | "m2_uma_pro_max" | "m3_uma_pro_max" | "sm70" | "sm80" | "sm86" | "sm89" | "sm90" | "sm120" | "vulkan_amd" | "cloud"; +export type HwCapabilityTier = "cpu_only" | "m1_uma8_gb" | "m1_uma16_gb" | "m2_uma_pro_max" | "m3_uma_pro_max" | "sm70" | "sm75" | "sm80" | "sm86" | "sm89" | "sm90" | "sm100" | "sm120" | "vulkan_amd" | "cloud"; diff --git a/src/shared/generated/cognition/ModelRequirement.ts b/src/shared/generated/cognition/ModelRequirement.ts index c547d256e..643bbe1cb 100644 --- a/src/shared/generated/cognition/ModelRequirement.ts +++ b/src/shared/generated/cognition/ModelRequirement.ts @@ -25,21 +25,6 @@ archPreference: Array, * Minimum context window in tokens. `0` = any. */ contextWindowMin: number, -/** - * Maximum memory the resolved model may consume on this host, in MB. 
- * `None` = use `host.available_memory_mb` as the implicit cap. - * - * **Currently OBSERVED but NOT ENFORCED.** Memory-budget filtering - * requires the [`Model`] schema to gain an `estimated_memory_mb` - * field — tracked as a separate followup. Until then, callers that - * pass this expecting filtering will silently get over-budget - * models. The `LocalOnly` / `CloudOnly` filter still prevents the - * worst class of mis-routing (running a 7B local model on the cloud - * lane). Loud-fail on memory pressure is a Lane B - * (FootprintRegistry / PressureBroker) concern downstream of - * resolution, not a resolver-side filter. - */ -memoryBudgetMb?: number, /** * Local-vs-cloud preference. See [`LocalOrCloudPolicy`]. */ diff --git a/src/shared/generated/model_registry/ProviderKind.ts b/src/shared/generated/model_registry/ProviderKind.ts new file mode 100644 index 000000000..82d216be9 --- /dev/null +++ b/src/shared/generated/model_registry/ProviderKind.ts @@ -0,0 +1,10 @@ +// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually. + +/** + * Where a provider runs its inference. Resolver consumes this to honor + * `LocalOrCloudPolicy` without needing a hardcoded provider-id list. + * Providers default to [`ProviderKind::Cloud`] so adding a new cloud + * provider TOML row doesn't require an explicit `kind` line; local + * providers MUST declare `kind = "local"` explicitly. 
+ */ +export type ProviderKind = "local" | "cloud"; diff --git a/src/shared/generated/model_registry/index.ts b/src/shared/generated/model_registry/index.ts index afd28d110..fa4bac8f0 100644 --- a/src/shared/generated/model_registry/index.ts +++ b/src/shared/generated/model_registry/index.ts @@ -4,3 +4,4 @@ export type { Arch } from './Arch'; export type { Capability } from './Capability'; +export type { ProviderKind } from './ProviderKind'; diff --git a/src/workers/continuum-core/config/providers.toml b/src/workers/continuum-core/config/providers.toml index baa631081..6bad70160 100644 --- a/src/workers/continuum-core/config/providers.toml +++ b/src/workers/continuum-core/config/providers.toml @@ -82,6 +82,7 @@ model_prefixes = ["gemini"] [[provider]] id = "docker-model-runner" name = "Docker Model Runner (local Metal/CUDA)" +kind = "local" # IPv4 literal on purpose — `localhost` on macOS resolves to both ::1 and # 127.0.0.1 and Docker Desktop's model runner listens on IPv4 only. When # the hyper client tries ::1 first it waits for the connect path to fall @@ -98,6 +99,7 @@ auth = "none" [[provider]] id = "llamacpp-local" name = "Llama.cpp (in-process Metal/CUDA)" +kind = "local" base_url = "in-process" auth = "none" default_model = "continuum-ai/qwen3.5-4b-code-forged-GGUF" diff --git a/src/workers/continuum-core/src/cognition/model_resolver.rs b/src/workers/continuum-core/src/cognition/model_resolver.rs index df9518fe1..45f13b850 100644 --- a/src/workers/continuum-core/src/cognition/model_resolver.rs +++ b/src/workers/continuum-core/src/cognition/model_resolver.rs @@ -24,9 +24,9 @@ //! `models.toml` or `models.json` — the registry already loaded both. 
use crate::cognition::adaptive_throughput::TargetSilicon; -use crate::model_registry::types::{Arch, Capability, Model}; +use crate::model_registry::types::{Arch, Capability, Model, Provider, ProviderKind}; use serde::{Deserialize, Serialize}; -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use ts_rs::TS; /// Finer-grained hardware tier than [`TargetSilicon`]. Selects which model @@ -68,6 +68,9 @@ pub enum HwCapabilityTier { M3UmaProMax, /// nVidia compute capability 7.0 (V100). Sm70, + /// nVidia compute capability 7.5 (T4 datacenter, RTX 20xx, GTX 16xx). + /// Common on cloud GPU inference instances. + Sm75, /// nVidia compute capability 8.0 (A100). Sm80, /// nVidia compute capability 8.6 (RTX 30xx, A40). @@ -76,7 +79,11 @@ pub enum HwCapabilityTier { Sm89, /// nVidia compute capability 9.0 (H100). Sm90, - /// nVidia compute capability 12.0 (RTX 50xx). + /// nVidia compute capability 10.0 (Blackwell datacenter B100/B200, + /// HBM3e). Distinct from `Sm120` — Blackwell-consumer (RTX 50xx) and + /// Blackwell-datacenter take different driver paths. + Sm100, + /// nVidia compute capability 12.0 (RTX 50xx Blackwell-consumer). Sm120, /// AMD GPU via Vulkan backend. VulkanAmd, @@ -147,20 +154,6 @@ pub struct ModelRequirement { /// Minimum context window in tokens. `0` = any. #[serde(default)] pub context_window_min: u32, - /// Maximum memory the resolved model may consume on this host, in MB. - /// `None` = use `host.available_memory_mb` as the implicit cap. - /// - /// **Currently OBSERVED but NOT ENFORCED.** Memory-budget filtering - /// requires the [`Model`] schema to gain an `estimated_memory_mb` - /// field — tracked as a separate followup. Until then, callers that - /// pass this expecting filtering will silently get over-budget - /// models. The `LocalOnly` / `CloudOnly` filter still prevents the - /// worst class of mis-routing (running a 7B local model on the cloud - /// lane). 
Loud-fail on memory pressure is a Lane B - /// (FootprintRegistry / PressureBroker) concern downstream of - /// resolution, not a resolver-side filter. - #[ts(optional)] - pub memory_budget_mb: Option, /// Local-vs-cloud preference. See [`LocalOrCloudPolicy`]. pub provider_policy: LocalOrCloudPolicy, /// Host capability snapshot. See [`HostCapability`]. @@ -221,43 +214,54 @@ pub enum ResolutionError { }, } -/// Provider ids treated as local. Hardcoded for v1; a follow-up moves this -/// to a `kind: local|cloud` field on `Provider` in `providers.toml`. -const LOCAL_PROVIDER_IDS: &[&str] = &["llamacpp-local", "docker-model-runner"]; - -fn is_local_provider(provider_id: &str) -> bool { - LOCAL_PROVIDER_IDS.contains(&provider_id) -} - -fn derive_target_silicon(model: &Model, host: &HostCapability) -> TargetSilicon { - if is_local_provider(&model.provider) { - host.primary_target_silicon - } else { - TargetSilicon::Cloud +fn derive_target_silicon( + model: &Model, + provider_kinds: &HashMap<&str, ProviderKind>, + host: &HostCapability, +) -> TargetSilicon { + let kind = provider_kinds + .get(model.provider.as_str()) + .copied() + .unwrap_or_default(); // ProviderKind::Cloud — unknown provider treated as cloud + match kind { + ProviderKind::Local => host.primary_target_silicon, + ProviderKind::Cloud => TargetSilicon::Cloud, } } -/// Resolve a [`ModelRequirement`] against a model catalog. Pure: caller -/// supplies the iterator of [`Model`] (typically `registry.models()`). +/// Resolve a [`ModelRequirement`] against a model catalog + provider table. +/// Pure: caller supplies iterators of [`Model`] and [`Provider`] (typically +/// `registry.models()` and `registry.providers()`). /// /// Filter order (each step records the unmet predicate when it eliminates /// the last candidate, so the error names the specific cause): /// 1. `required_capabilities` — every cap must be advertised /// 2. `arch_preference` — when non-empty, must match /// 3. 
`context_window_min` — model's window ≥ requirement -/// 4. `provider_policy` — Local/Cloud filter -/// 5. memory budget — local models with declared estimates only +/// 4. `provider_policy` — Local/Cloud filter, keyed on the provider's +/// [`ProviderKind`] (no hardcoded provider-id list — providers declare +/// their own residency in `providers.toml`) /// /// Returns the first survivor under the policy's ranking. `PreferLocal` /// puts local providers first; `PreferCloud` puts cloud providers first; /// other policies preserve registry order. -pub fn resolve_model<'a, I>( +pub fn resolve_model<'a, M, P>( requirement: &ModelRequirement, - models: I, + models: M, + providers: P, ) -> Result where - I: IntoIterator, + M: IntoIterator, + P: IntoIterator, { + let provider_kinds: HashMap<&str, ProviderKind> = providers + .into_iter() + .map(|p| (p.id.as_str(), p.kind)) + .collect(); + let is_local = |provider_id: &str| { + provider_kinds.get(provider_id).copied().unwrap_or_default() == ProviderKind::Local + }; + let registry: Vec<&Model> = models.into_iter().collect(); let registry_count = registry.len(); let mut unmet: Vec = Vec::new(); @@ -266,12 +270,7 @@ where let mut candidates: Vec<&Model> = registry .iter() .copied() - .filter(|m| { - requirement - .required_capabilities - .iter() - .all(|c| m.has(*c)) - }) + .filter(|m| requirement.required_capabilities.iter().all(|c| m.has(*c))) .collect(); if candidates.is_empty() && !requirement.required_capabilities.is_empty() { unmet.push(format!( @@ -326,8 +325,8 @@ where // Filter 4: provider policy. 
let before_provider = candidates.len(); candidates.retain(|m| match requirement.provider_policy { - LocalOrCloudPolicy::LocalOnly => is_local_provider(&m.provider), - LocalOrCloudPolicy::CloudOnly => !is_local_provider(&m.provider), + LocalOrCloudPolicy::LocalOnly => is_local(&m.provider), + LocalOrCloudPolicy::CloudOnly => !is_local(&m.provider), LocalOrCloudPolicy::PreferLocal | LocalOrCloudPolicy::PreferCloud | LocalOrCloudPolicy::Any => true, @@ -347,16 +346,16 @@ where // Rank: PreferLocal/PreferCloud reorder; other policies preserve order. match requirement.provider_policy { LocalOrCloudPolicy::PreferLocal => { - candidates.sort_by_key(|m| u8::from(!is_local_provider(&m.provider))); + candidates.sort_by_key(|m| u8::from(!is_local(&m.provider))); } LocalOrCloudPolicy::PreferCloud => { - candidates.sort_by_key(|m| u8::from(is_local_provider(&m.provider))); + candidates.sort_by_key(|m| u8::from(is_local(&m.provider))); } _ => {} } let best = candidates.first().expect("non-empty after filters"); - let target_silicon = derive_target_silicon(best, &requirement.host); + let target_silicon = derive_target_silicon(best, &provider_kinds, &requirement.host); let reason = format!( "matched {} required capability(ies) on arch={:?}, context={}, provider={}, policy={:?}", requirement.required_capabilities.len(), @@ -383,7 +382,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::model_registry::types::MultiPartyChatStrategy; + use crate::model_registry::types::{AuthKind, MultiPartyChatStrategy}; fn make_model( id: &str, @@ -412,6 +411,27 @@ mod tests { } } + fn make_provider(id: &str, kind: ProviderKind) -> Provider { + Provider { + id: id.into(), + name: None, + base_url: "http://test".into(), + api_key_env: None, + default_model: None, + auth: AuthKind::None, + model_prefixes: vec![], + kind, + } + } + + fn providers() -> Vec { + vec![ + make_provider("anthropic", ProviderKind::Cloud), + make_provider("openai", ProviderKind::Cloud), + 
make_provider("llamacpp-local", ProviderKind::Local), + ] + } + fn host_m1_8gb() -> HostCapability { HostCapability { hw_capability_tier: HwCapabilityTier::M1Uma8Gb, @@ -501,7 +521,6 @@ mod tests { required_capabilities: [Capability::Chat].iter().copied().collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::LocalOnly, host, } @@ -515,7 +534,6 @@ mod tests { .collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::LocalOnly, host, } @@ -524,12 +542,12 @@ mod tests { #[test] fn local_chat_resolves_to_qwen35_on_m1() { let r = registry(); - let resolved = resolve_model(&req_chat_local(host_m1_8gb()), r.iter()).unwrap(); + let resolved = + resolve_model(&req_chat_local(host_m1_8gb()), r.iter(), providers().iter()).unwrap(); assert_eq!(resolved.provider_id, "llamacpp-local"); - assert!( - resolved.model_id.starts_with("continuum-ai/qwen3.5") || resolved.model_id.starts_with("qwen2"), - "expected a local qwen model, got {}", + assert_eq!( resolved.model_id, + "continuum-ai/qwen3.5-4b-code-forged-GGUF" ); assert_eq!(resolved.target_silicon, TargetSilicon::UnifiedMemory); assert_eq!(resolved.hw_capability_tier, HwCapabilityTier::M1Uma8Gb); @@ -538,7 +556,12 @@ mod tests { #[test] fn vision_request_resolves_to_qwen2_vl() { let r = registry(); - let resolved = resolve_model(&req_vision_local(host_rtx5090()), r.iter()).unwrap(); + let resolved = resolve_model( + &req_vision_local(host_rtx5090()), + r.iter(), + providers().iter(), + ) + .unwrap(); assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); assert_eq!(resolved.provider_id, "llamacpp-local"); assert_eq!(resolved.target_silicon, TargetSilicon::Gpu); @@ -550,7 +573,7 @@ mod tests { let r = registry(); let mut req = req_chat_local(host_rtx5090()); req.provider_policy = LocalOrCloudPolicy::CloudOnly; - let resolved = resolve_model(&req, r.iter()).unwrap(); + let resolved = 
resolve_model(&req, r.iter(), providers().iter()).unwrap(); assert!( ["anthropic", "openai"].contains(&resolved.provider_id.as_str()), "expected cloud provider, got {}", @@ -566,11 +589,10 @@ mod tests { required_capabilities: [Capability::ImageGeneration].iter().copied().collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::Any, host: host_rtx5090(), }; - let err = resolve_model(&req, r.iter()).unwrap_err(); + let err = resolve_model(&req, r.iter(), providers().iter()).unwrap_err(); let ResolutionError::NoModelMatchesRequirement { registry_count, candidates_after_filter, @@ -592,7 +614,12 @@ mod tests { // run it). The resolver answers "what fits the requirement," // not "what will succeed at inference time." let r = registry(); - let resolved = resolve_model(&req_vision_local(host_cpu_only()), r.iter()).unwrap(); + let resolved = resolve_model( + &req_vision_local(host_cpu_only()), + r.iter(), + providers().iter(), + ) + .unwrap(); assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); assert_eq!(resolved.target_silicon, TargetSilicon::Cpu); assert_eq!(resolved.hw_capability_tier, HwCapabilityTier::CpuOnly); @@ -605,13 +632,15 @@ mod tests { required_capabilities: [Capability::Chat].iter().copied().collect(), arch_preference: vec![], context_window_min: 100_000, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::LocalOnly, host: host_rtx5090(), }; - let resolved = resolve_model(&req, r.iter()).unwrap(); + let resolved = resolve_model(&req, r.iter(), providers().iter()).unwrap(); // Only qwen3.5-4b (262144 ctx) survives among local with ≥100k window. 
- assert_eq!(resolved.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!( + resolved.model_id, + "continuum-ai/qwen3.5-4b-code-forged-GGUF" + ); } #[test] @@ -621,12 +650,14 @@ mod tests { required_capabilities: [Capability::Chat].iter().copied().collect(), arch_preference: vec![Arch::Qwen35], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::Any, host: host_rtx5090(), }; - let resolved = resolve_model(&req, r.iter()).unwrap(); - assert_eq!(resolved.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + let resolved = resolve_model(&req, r.iter(), providers().iter()).unwrap(); + assert_eq!( + resolved.model_id, + "continuum-ai/qwen3.5-4b-code-forged-GGUF" + ); } #[test] @@ -639,11 +670,10 @@ mod tests { .collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::PreferLocal, host: host_rtx5090(), }; - let resolved = resolve_model(&req, r.iter()).unwrap(); + let resolved = resolve_model(&req, r.iter(), providers().iter()).unwrap(); assert_eq!(resolved.provider_id, "llamacpp-local"); assert_eq!(resolved.model_id, "qwen2-vl-7b-instruct"); } @@ -658,11 +688,10 @@ mod tests { .collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::PreferCloud, host: host_rtx5090(), }; - let resolved = resolve_model(&req, r.iter()).unwrap(); + let resolved = resolve_model(&req, r.iter(), providers().iter()).unwrap(); assert!( ["anthropic", "openai"].contains(&resolved.provider_id.as_str()), "expected cloud first, got {}", @@ -670,6 +699,47 @@ mod tests { ); } + #[test] + fn provider_kind_drives_local_classification_not_id() { + // Confirms the LOCAL_PROVIDER_IDS hardcoding is gone — Provider's + // kind field is what decides Local vs Cloud. Construct a custom + // provider whose id has nothing to do with the old hardcoded set. 
+ let models = vec![make_model( + "custom-local-model", + "custom-local-provider", + Arch::Llama, + 8192, + &[Capability::Chat], + )]; + let providers = vec![make_provider("custom-local-provider", ProviderKind::Local)]; + let req = req_chat_local(host_m1_8gb()); + let resolved = resolve_model(&req, models.iter(), providers.iter()).unwrap(); + assert_eq!(resolved.model_id, "custom-local-model"); + assert_eq!(resolved.target_silicon, TargetSilicon::UnifiedMemory); + } + + #[test] + fn unknown_provider_defaults_to_cloud_for_safety() { + // If a model references a provider id that isn't in the providers + // table at all, the resolver treats it as Cloud (default kind). + // This is loud: a LocalOnly query will reject the model rather + // than silently routing unknown-residency work to local hardware. + let models = vec![make_model( + "orphan-model", + "orphan-provider", + Arch::Llama, + 8192, + &[Capability::Chat], + )]; + let providers: Vec = vec![]; + let req = req_chat_local(host_m1_8gb()); + let err = resolve_model(&req, models.iter(), providers.iter()).unwrap_err(); + assert!( + matches!(err, ResolutionError::NoModelMatchesRequirement { .. }), + "LocalOnly with unknown provider must error, not silently treat as local" + ); + } + #[test] fn five_persona_resolution_smoke() { // Lane C contract test: 5 personas with different needs all @@ -677,17 +747,23 @@ mod tests { let r = registry(); // Persona 1: Helper AI — local chat. - let helper = resolve_model(&req_chat_local(host_m1_8gb()), r.iter()).unwrap(); + let helper = + resolve_model(&req_chat_local(host_m1_8gb()), r.iter(), providers().iter()).unwrap(); assert_eq!(helper.provider_id, "llamacpp-local"); // Persona 2: Vision AI — local vision. 
- let vision = resolve_model(&req_vision_local(host_m1_8gb()), r.iter()).unwrap(); + let vision = resolve_model( + &req_vision_local(host_m1_8gb()), + r.iter(), + providers().iter(), + ) + .unwrap(); assert_eq!(vision.model_id, "qwen2-vl-7b-instruct"); // Persona 3: Cloud-only persona — wants vision via cloud. let mut cloud_vision_req = req_vision_local(host_m1_8gb()); cloud_vision_req.provider_policy = LocalOrCloudPolicy::CloudOnly; - let cloud_vision = resolve_model(&cloud_vision_req, r.iter()).unwrap(); + let cloud_vision = resolve_model(&cloud_vision_req, r.iter(), providers().iter()).unwrap(); assert!( ["anthropic", "openai"].contains(&cloud_vision.provider_id.as_str()), "expected cloud, got {}", @@ -702,7 +778,7 @@ mod tests { .copied() .collect(); audio_req.provider_policy = LocalOrCloudPolicy::Any; - let audio = resolve_model(&audio_req, r.iter()).unwrap(); + let audio = resolve_model(&audio_req, r.iter(), providers().iter()).unwrap(); assert_eq!(audio.model_id, "gpt-4o"); // Persona 5: Code persona requiring tool-use — qwen3.5 OR claude. @@ -712,7 +788,7 @@ mod tests { .copied() .collect(); code_req.provider_policy = LocalOrCloudPolicy::PreferLocal; - let code = resolve_model(&code_req, r.iter()).unwrap(); + let code = resolve_model(&code_req, r.iter(), providers().iter()).unwrap(); assert_eq!(code.provider_id, "llamacpp-local"); assert_eq!(code.model_id, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); @@ -723,13 +799,12 @@ mod tests { required_capabilities: [Capability::ImageGeneration].iter().copied().collect(), arch_preference: vec![], context_window_min: 0, - memory_budget_mb: None, provider_policy: LocalOrCloudPolicy::Any, host: host_rtx5090(), }; assert!( matches!( - resolve_model(&img_req, r.iter()), + resolve_model(&img_req, r.iter(), providers().iter()), Err(ResolutionError::NoModelMatchesRequirement { .. 
}) ), "missing capability must error, not fall back" diff --git a/src/workers/continuum-core/src/model_registry/types.rs b/src/workers/continuum-core/src/model_registry/types.rs index 33aa1376c..127462592 100644 --- a/src/workers/continuum-core/src/model_registry/types.rs +++ b/src/workers/continuum-core/src/model_registry/types.rs @@ -19,10 +19,7 @@ use std::path::PathBuf; #[derive( Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, ts_rs::TS, )] -#[ts( - export, - export_to = "../../../shared/generated/model_registry/Arch.ts" -)] +#[ts(export, export_to = "../../../shared/generated/model_registry/Arch.ts")] #[serde(rename_all = "snake_case")] pub enum Arch { Qwen2, @@ -85,6 +82,41 @@ pub enum Capability { Reranking, } +/// Where a provider runs its inference. Resolver consumes this to honor +/// `LocalOrCloudPolicy` without needing a hardcoded provider-id list. +/// Providers default to [`ProviderKind::Cloud`] so adding a new cloud +/// provider TOML row doesn't require an explicit `kind` line; local +/// providers MUST declare `kind = "local"` explicitly. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + Default, + Serialize, + Deserialize, + ts_rs::TS, +)] +#[ts( + export, + export_to = "../../../shared/generated/model_registry/ProviderKind.ts" +)] +#[serde(rename_all = "snake_case")] +pub enum ProviderKind { + /// In-process or localhost backend. Inference runs on this host's + /// hardware (CPU / GPU / unified memory). Examples: `llamacpp-local`, + /// `docker-model-runner`. + Local, + /// Remote HTTP API. Inference runs off-host; this provider counts + /// toward `TargetSilicon::Cloud` admission. Default for new providers. + #[default] + Cloud, +} + /// HTTP authentication mode for a provider's API. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] @@ -286,6 +318,12 @@ pub struct Provider { /// dispatch via live /v1/models probes instead. #[serde(default)] pub model_prefixes: Vec<String>, + /// Where this provider runs inference. See [`ProviderKind`]. Defaults + /// to `Cloud` when omitted in TOML — local providers must declare + /// `kind = "local"` explicitly so adding a new cloud provider doesn't + /// require touching this field. + #[serde(default)] + pub kind: ProviderKind, } impl Provider {