CambrianTech · joelteply · May 8, 2026 · May 8, 2026
diff --git a/src/commands/ai/should-respond/README.md b/src/commands/ai/should-respond/README.md
@@ -23,7 +23,7 @@ PersonaUser.shouldRespondToMessage()
        ↓
 ChatRAGBuilder (reuse existing RAG assembly)
        ↓
-ai/generate (llama3.2:3b with gating prompt)
+ai/generate (local Qwen with gating prompt)
        ↓
 Parse JSON response:
    {
@@ -136,15 +136,15 @@ You are a conversation coordinator for a multi-party chat room.
 - ✅ Explainable decisions (logs show reasoning)
 
 **vs Expensive Model for Every Decision:**
-- ✅ Use **llama3.2:3b** (2GB, fast, free)
+- ✅ Use the local Qwen gating/default model (fast, free, Rust-admitted)
 - ✅ Simple YES/NO decision (low temperature, 200 tokens)
 - ✅ ~1-2 seconds per decision
 - ✅ **Fail-safe fallback** to simple heuristics if AI unavailable
 
 ### Cost Analysis
 
 **Current Problem**: All 3 personas generate full responses (12+ messages)
-- 12 × llama3.2:3b calls = 12 × ~5 seconds = **60 seconds total**
+- 12 × local model calls = 12 × ~5 seconds = **60 seconds total**
 - 12 × 150 tokens = **1,800 tokens wasted**
 
 **With AI Gating**:

diff --git a/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts b/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts
@@ -48,10 +48,10 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand {
           ...markedHistory,  // Conversation with trigger message marked
           { role: 'user', content: gatingInstruction }
         ],
-        model: params.model ?? LOCAL_MODELS.DEFAULT,  // Candle uses pre-loaded model
+        model: params.model ?? LOCAL_MODELS.DEFAULT,
         temperature: 0.3,
         maxTokens: 200,
-        provider: 'candle'
+        provider: 'local'
       };
 
       const response = await AIProviderDaemon.generateText(request);
@@ -65,26 +65,26 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand {
 
       // If parsing failed (confidence = 0.0 means parse error), retry once with a JSON-repair prompt on LOCAL_MODELS.DEFAULT at lower temperature
       if (parsed.confidence === 0.0 && parsed.reason === 'Failed to parse AI response') {
-        console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with Candle to fix malformed JSON`);
+        console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with local Qwen to fix malformed JSON`);
 
         const fixRequest: TextGenerationRequest = {
           messages: [
             { role: 'system', content: 'You are a JSON repair tool. Fix malformed JSON and return valid JSON only.' },
             { role: 'user', content: `This JSON is malformed:\n\n${response.text}\n\nFix it and return ONLY valid JSON with this exact structure:\n{\n  "shouldRespond": true/false,\n  "confidence": 0.0-1.0,\n  "reason": "string",\n  "factors": {\n    "mentioned": true/false,\n    "questionAsked": true/false,\n    "domainRelevant": true/false,\n    "recentlySpoke": true/false,\n    "othersAnswered": true/false\n  }\n}` }
           ],
-          model: LOCAL_MODELS.DEFAULT,  // Candle uses pre-loaded model
+          model: LOCAL_MODELS.DEFAULT,
           temperature: 0.1,  // Low temp for structured output
           maxTokens: 200,
-          provider: 'candle'
+          provider: 'local'
         };
 
         const fixedResponse = await AIProviderDaemon.generateText(fixRequest);
         if (fixedResponse.text) {
           parsed = this.parseGatingResponse(fixedResponse.text);
           if (parsed.confidence !== 0.0) {
-            console.log(`✅ JSON repair succeeded with Candle`);
+            console.log(`✅ JSON repair succeeded with local Qwen`);
           } else {
-            throw new Error(`JSON repair failed even with Candle. Original: ${response.text.slice(0, 200)}`);
+            throw new Error(`JSON repair failed even with local Qwen. Original: ${response.text.slice(0, 200)}`);
           }
         } else {
           throw new Error(`JSON repair request failed: ${fixedResponse.error}`);

diff --git a/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts b/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts
@@ -3,7 +3,7 @@
  *
  * Sentinel/Coordinator pattern: Use AI to intelligently gate persona responses
  *
- * Uses llama3.2:3b (validated, fast, cheap) to analyze full conversation context
+ * Uses the local Qwen model (LOCAL_MODELS.DEFAULT unless overridden) to analyze full conversation context
  * and decide if a persona should respond to a message.
  */
 

diff --git a/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts b/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts
@@ -46,7 +46,7 @@ export interface AIShouldRespondParams extends CommandParams {
   /** Detection strategy (default: 'fast') */
   readonly strategy?: ResponseStrategy;
 
-  /** Optional: Override model (defaults to llama3.2:3b for LLM strategy) */
+  /** Optional: Override model (defaults to LOCAL_MODELS.DEFAULT for LLM strategy) */
   readonly model?: string;
 
   /** Verbose mode - include full RAG context and prompt in response */
@@ -159,4 +159,3 @@ export const createAiShouldRespondResultFromParams = (
   params: AIShouldRespondParams,
   differences: Omit<AIShouldRespondResult, 'context' | 'sessionId' | 'userId'>
 ): AIShouldRespondResult => transformPayload(params, differences);
-
diff --git a/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts b/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts
@@ -11,6 +11,7 @@ import type { ICommandDaemon } from '../../../../daemons/command-daemon/shared/C
 import type { AIValidateResponseParams, AIValidateResponseResult, ResponseDecision } from '../shared/AIValidateResponseTypes';
 import { AIProviderDaemon } from '../../../../daemons/ai-provider-daemon/shared/AIProviderDaemon';
 import type { TextGenerationRequest } from '../../../../daemons/ai-provider-daemon/shared/AIProviderTypesV2';
+import { LOCAL_MODELS } from '../../../../system/shared/Constants';
 
 export class AIValidateResponseServerCommand extends CommandBase<AIValidateResponseParams, AIValidateResponseResult> {
   constructor(context: JTAGContext, subpath: string, commander: ICommandDaemon) {
@@ -27,10 +28,10 @@ export class AIValidateResponseServerCommand extends CommandBase<AIValidateRespo
         { role: 'system', content: 'You are a response validator. Reply ONLY with one word: SUBMIT, CLARIFY, or SILENT.' },
         { role: 'user', content: validationPrompt }
       ],
-      model: params.model ?? 'llama3.2:3b',
+      model: params.model ?? LOCAL_MODELS.GATING,
       temperature: 0.1,  // Low temp for consistent decisions
       maxTokens: 10,     // Just need one word
-      provider: 'candle'
+      provider: 'local'
     };
 
     const response = await AIProviderDaemon.generateText(request);

diff --git a/src/commands/ai/validate-response/shared/AIValidateResponseTypes.ts b/src/commands/ai/validate-response/shared/AIValidateResponseTypes.ts
@@ -33,7 +33,7 @@ export interface AIValidateResponseParams extends CommandParams {
   /** Optional: Conversation context for better evaluation */
   readonly conversationContext?: string;
 
-  /** Optional: Override model (defaults to llama3.2:3b) */
+  /** Optional: Override model (defaults to LOCAL_MODELS.GATING) */
   readonly model?: string;
 
   /** Verbose mode - include prompt and AI reasoning */
@@ -109,4 +109,3 @@ export const createAiValidateResponseResultFromParams = (
   params: AIValidateResponseParams,
   differences: Omit<AIValidateResponseResult, 'context' | 'sessionId' | 'userId'>
 ): AIValidateResponseResult => transformPayload(params, differences);
-
diff --git a/src/system/shared/Constants.ts b/src/system/shared/Constants.ts
@@ -199,13 +199,50 @@ export const LOCAL_MODELS = {
     'qwen2.5': 'Qwen/Qwen2.5-7B-Instruct',
   } as const,
 
+  /**
+   * Removed local runtime aliases.
+   *
+   * These used to route persona/chat inference through ad hoc llama/Candle
+   * paths. Local persona inference is now Qwen + Rust admission only. Fail
+   * loudly so stale DB rows or command params do not silently pick the wrong
+   * model/provider and burn CPU.
+   */
+  REMOVED_LOCAL_ALIASES: {
+    'llama3': 'qwen3.5',
+    'llama3:8b': 'qwen3.5',
+    'llama3.1': 'qwen3.5',
+    'llama3.1:8b': 'qwen3.5',
+    'llama3.2': 'qwen3.5',
+    'llama3.2:1b': 'qwen2',
+    'llama3.2:3b': 'qwen3.5',
+    'phi3': 'qwen2',
+    'phi3:mini': 'qwen2',
+    'tinyllama': 'qwen2',
+    'smollm2': 'qwen2',
+    'codellama': 'qwen3.5-code',
+  } as const,
+
   /**
    * Map a model name to HuggingFace ID
    * Returns original if not found (might already be a HuggingFace ID); throws if the name is a removed local alias
    */
   mapToHuggingFace(modelName: string): string {
     const normalized = modelName.toLowerCase().trim();
     const mapping = LOCAL_MODELS.LEGACY_TO_HUGGINGFACE as Record<string, string>;
+    const removedAliases = LOCAL_MODELS.REMOVED_LOCAL_ALIASES as Record<string, string>;
+
+    const assertNotRemoved = (candidate: string): void => {
+      const replacement = removedAliases[candidate];
+      if (replacement) {
+        throw new Error(
+          `Local model alias '${modelName}' was removed from the runtime. ` +
+          `Continuum local chat uses Qwen through Rust/llama.cpp admission only. ` +
+          `Use '${replacement}' or LOCAL_MODELS.DEFAULT instead.`
+        );
+      }
+    };
+
+    assertNotRemoved(normalized);
 
     // Direct lookup
     if (mapping[normalized]) {
@@ -214,6 +251,7 @@ export const LOCAL_MODELS = {
 
     // Try without version suffix (e.g., 'qwen3.5:4b-instruct' -> 'qwen3.5:4b')
     const withoutSuffix = normalized.replace(/-instruct.*$|-chat.*$|-q\d+.*$/i, '');
+    assertNotRemoved(withoutSuffix);
     if (mapping[withoutSuffix]) {
       return mapping[withoutSuffix];
     }

diff --git a/src/system/user/server/PersonaLifecycleManager.ts b/src/system/user/server/PersonaLifecycleManager.ts
@@ -12,6 +12,7 @@
 import { Events } from '../../core/shared/Events';
 import { Commands } from '../../core/shared/Commands';
 import type { CommandParams } from '../../core/types/JTAGTypes';
+import { SecretManager } from '../../secrets/SecretManager';
 
 interface KeyChangeEvent {
   provider: string;
@@ -293,6 +294,7 @@ export class PersonaLifecycleManager {
       'SENTINEL_PATH',
     ];
 
-    return knownKeyVars.filter(key => !!process.env[key]);
+    const secrets = SecretManager.getInstance();
+    return knownKeyVars.filter(key => Boolean(secrets.get(key, 'PersonaLifecycleManager.collectAvailableApiKeys')));
   }
 }
diff --git a/src/tests/unit/local-model-guardrails.test.ts b/src/tests/unit/local-model-guardrails.test.ts
@@ -0,0 +1,26 @@
+import { describe, expect, it } from 'vitest';
+import { LOCAL_MODELS } from '@system/shared/Constants';
+
+describe('LOCAL_MODELS guardrails', () => {
+  it('keeps accepted Qwen aliases mapped through the local runtime source of truth', () => {
+    expect(LOCAL_MODELS.mapToHuggingFace('qwen3.5')).toBe(LOCAL_MODELS.DEFAULT);
+    expect(LOCAL_MODELS.mapToHuggingFace('qwen3.5:4b')).toBe(LOCAL_MODELS.DEFAULT);
+    expect(LOCAL_MODELS.mapToHuggingFace('qwen2-vl')).toBe(LOCAL_MODELS.VISION);
+  });
+
+  it('rejects removed local aliases instead of silently routing stale llama/Candle configs', () => {
+    for (const alias of Object.keys(LOCAL_MODELS.REMOVED_LOCAL_ALIASES)) {
+      expect(() => LOCAL_MODELS.mapToHuggingFace(alias)).toThrow(/was removed from the runtime/);
+    }
+  });
+
+  it('rejects removed aliases even when callers append an instruction or quant suffix', () => {
+    expect(() => LOCAL_MODELS.mapToHuggingFace('llama3.2:3b-instruct')).toThrow(/Use 'qwen3.5'/);
+    expect(() => LOCAL_MODELS.mapToHuggingFace('phi3:mini-q4_k_m')).toThrow(/Use 'qwen2'/);
+  });
+
+  it('still accepts explicit HuggingFace ids for registry/catalog entries', () => {
+    const rawModel = 'Qwen/Qwen2.5-7B-Instruct';
+    expect(LOCAL_MODELS.mapToHuggingFace(rawModel)).toBe(rawModel);
+  });
+});