From 0e722a2eb48f0ddc930b053305972a3729c97a71 Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 19:23:42 -0500 Subject: [PATCH] Guard local model runtime boundaries --- src/commands/ai/should-respond/README.md | 6 +-- .../server/AIShouldRespondServerCommand.ts | 14 +++---- .../shared/AIShouldRespondCommand.ts | 2 +- .../shared/AIShouldRespondTypes.ts | 3 +- .../server/AIValidateResponseServerCommand.ts | 5 ++- .../shared/AIValidateResponseTypes.ts | 3 +- src/system/shared/Constants.ts | 38 +++++++++++++++++++ .../user/server/PersonaLifecycleManager.ts | 4 +- src/tests/unit/local-model-guardrails.test.ts | 26 +++++++++++++ 9 files changed, 83 insertions(+), 18 deletions(-) create mode 100644 src/tests/unit/local-model-guardrails.test.ts diff --git a/src/commands/ai/should-respond/README.md b/src/commands/ai/should-respond/README.md index 804538ffd..253d91a25 100644 --- a/src/commands/ai/should-respond/README.md +++ b/src/commands/ai/should-respond/README.md @@ -23,7 +23,7 @@ PersonaUser.shouldRespondToMessage() ↓ ChatRAGBuilder (reuse existing RAG assembly) ↓ -ai/generate (llama3.2:3b with gating prompt) +ai/generate (local Qwen with gating prompt) ↓ Parse JSON response: { @@ -136,7 +136,7 @@ You are a conversation coordinator for a multi-party chat room. - ✅ Explainable decisions (logs show reasoning) **vs Expensive Model for Every Decision:** -- ✅ Use **llama3.2:3b** (2GB, fast, free) +- ✅ Use the local Qwen gating/default model (fast, free, Rust-admitted) - ✅ Simple YES/NO decision (low temperature, 200 tokens) - ✅ ~1-2 seconds per decision - ✅ **Fail-safe fallback** to simple heuristics if AI unavailable @@ -144,7 +144,7 @@ You are a conversation coordinator for a multi-party chat room. 
### Cost Analysis **Current Problem**: All 3 personas generate full responses (12+ messages) -- 12 × llama3.2:3b calls = 12 × ~5 seconds = **60 seconds total** +- 12 × local model calls = 12 × ~5 seconds = **60 seconds total** - 12 × 150 tokens = **1,800 tokens wasted** **With AI Gating**: diff --git a/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts b/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts index cfac7c7fd..b0b410d0f 100644 --- a/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts +++ b/src/commands/ai/should-respond/server/AIShouldRespondServerCommand.ts @@ -48,10 +48,10 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand { ...markedHistory, // Conversation with trigger message marked { role: 'user', content: gatingInstruction } ], - model: params.model ?? LOCAL_MODELS.DEFAULT, // Candle uses pre-loaded model + model: params.model ?? LOCAL_MODELS.DEFAULT, temperature: 0.3, maxTokens: 200, - provider: 'candle' + provider: 'local' }; const response = await AIProviderDaemon.generateText(request); @@ -65,26 +65,26 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand { // If parsing failed (confidence = 0.0 means parse error), retry with better model to fix JSON if (parsed.confidence === 0.0 && parsed.reason === 'Failed to parse AI response') { - console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with Candle to fix malformed JSON`); + console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with local Qwen to fix malformed JSON`); const fixRequest: TextGenerationRequest = { messages: [ { role: 'system', content: 'You are a JSON repair tool. Fix malformed JSON and return valid JSON only.' 
}, { role: 'user', content: `This JSON is malformed:\n\n${response.text}\n\nFix it and return ONLY valid JSON with this exact structure:\n{\n "shouldRespond": true/false,\n "confidence": 0.0-1.0,\n "reason": "string",\n "factors": {\n "mentioned": true/false,\n "questionAsked": true/false,\n "domainRelevant": true/false,\n "recentlySpoke": true/false,\n "othersAnswered": true/false\n }\n}` } ], - model: LOCAL_MODELS.DEFAULT, // Candle uses pre-loaded model + model: LOCAL_MODELS.DEFAULT, temperature: 0.1, // Low temp for structured output maxTokens: 200, - provider: 'candle' + provider: 'local' }; const fixedResponse = await AIProviderDaemon.generateText(fixRequest); if (fixedResponse.text) { parsed = this.parseGatingResponse(fixedResponse.text); if (parsed.confidence !== 0.0) { - console.log(`✅ JSON repair succeeded with Candle`); + console.log(`✅ JSON repair succeeded with local Qwen`); } else { - throw new Error(`JSON repair failed even with Candle. Original: ${response.text.slice(0, 200)}`); + throw new Error(`JSON repair failed even with local Qwen. Original: ${response.text.slice(0, 200)}`); } } else { throw new Error(`JSON repair request failed: ${fixedResponse.error}`); diff --git a/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts b/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts index be38f3fb1..b5ea6dc71 100644 --- a/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts +++ b/src/commands/ai/should-respond/shared/AIShouldRespondCommand.ts @@ -3,7 +3,7 @@ * * Sentinel/Coordinator pattern: Use AI to intelligently gate persona responses * - * Uses llama3.2:3b (validated, fast, cheap) to analyze full conversation context + * Uses the local Qwen gating model to analyze full conversation context * and decide if a persona should respond to a message. 
*/ diff --git a/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts b/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts index defc94520..2e2efa6c8 100644 --- a/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts +++ b/src/commands/ai/should-respond/shared/AIShouldRespondTypes.ts @@ -46,7 +46,7 @@ export interface AIShouldRespondParams extends CommandParams { /** Detection strategy (default: 'fast') */ readonly strategy?: ResponseStrategy; - /** Optional: Override model (defaults to llama3.2:3b for LLM strategy) */ + /** Optional: Override model (defaults to LOCAL_MODELS.DEFAULT for LLM strategy) */ readonly model?: string; /** Verbose mode - include full RAG context and prompt in response */ @@ -159,4 +159,3 @@ export const createAiShouldRespondResultFromParams = ( params: AIShouldRespondParams, differences: Omit ): AIShouldRespondResult => transformPayload(params, differences); - diff --git a/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts b/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts index bc96885a6..3c6c03cdb 100644 --- a/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts +++ b/src/commands/ai/validate-response/server/AIValidateResponseServerCommand.ts @@ -11,6 +11,7 @@ import type { ICommandDaemon } from '../../../../daemons/command-daemon/shared/C import type { AIValidateResponseParams, AIValidateResponseResult, ResponseDecision } from '../shared/AIValidateResponseTypes'; import { AIProviderDaemon } from '../../../../daemons/ai-provider-daemon/shared/AIProviderDaemon'; import type { TextGenerationRequest } from '../../../../daemons/ai-provider-daemon/shared/AIProviderTypesV2'; +import { LOCAL_MODELS } from '../../../../system/shared/Constants'; export class AIValidateResponseServerCommand extends CommandBase { constructor(context: JTAGContext, subpath: string, commander: ICommandDaemon) { @@ -27,10 +28,10 @@ export class 
AIValidateResponseServerCommand extends CommandBase ): AIValidateResponseResult => transformPayload(params, differences); - diff --git a/src/system/shared/Constants.ts b/src/system/shared/Constants.ts index 60a7cc76e..153d52851 100644 --- a/src/system/shared/Constants.ts +++ b/src/system/shared/Constants.ts @@ -199,6 +199,29 @@ export const LOCAL_MODELS = { 'qwen2.5': 'Qwen/Qwen2.5-7B-Instruct', } as const, + /** + * Removed local runtime aliases. + * + * These used to route persona/chat inference through ad hoc llama/Candle + * paths. Local persona inference is now Qwen + Rust admission only. Fail + * loudly so stale DB rows or command params do not silently pick the wrong + * model/provider and burn CPU. + */ + REMOVED_LOCAL_ALIASES: { + 'llama3': 'qwen3.5', + 'llama3:8b': 'qwen3.5', + 'llama3.1': 'qwen3.5', + 'llama3.1:8b': 'qwen3.5', + 'llama3.2': 'qwen3.5', + 'llama3.2:1b': 'qwen2', + 'llama3.2:3b': 'qwen3.5', + 'phi3': 'qwen2', + 'phi3:mini': 'qwen2', + 'tinyllama': 'qwen2', + 'smollm2': 'qwen2', + 'codellama': 'qwen3.5-code', + } as const, + /** * Map a model name to HuggingFace ID * Returns original if not found (might already be a HuggingFace ID) @@ -206,6 +229,20 @@ export const LOCAL_MODELS = { mapToHuggingFace(modelName: string): string { const normalized = modelName.toLowerCase().trim(); const mapping = LOCAL_MODELS.LEGACY_TO_HUGGINGFACE as Record<string, string>; + const removedAliases = LOCAL_MODELS.REMOVED_LOCAL_ALIASES as Record<string, string>; + + const assertNotRemoved = (candidate: string): void => { + const replacement = removedAliases[candidate]; + if (replacement) { + throw new Error( + `Local model alias '${modelName}' was removed from the runtime. ` + + `Continuum local chat uses Qwen through Rust/llama.cpp admission only. 
` + + `Use '${replacement}' or LOCAL_MODELS.DEFAULT instead.` + ); + } + }; + + assertNotRemoved(normalized); // Direct lookup if (mapping[normalized]) { @@ -214,6 +251,7 @@ export const LOCAL_MODELS = { // Try without version suffix (e.g., 'qwen3.5:4b-instruct' -> 'qwen3.5:4b') const withoutSuffix = normalized.replace(/-instruct.*$|-chat.*$|-q\d+.*$/i, ''); + assertNotRemoved(withoutSuffix); if (mapping[withoutSuffix]) { return mapping[withoutSuffix]; } diff --git a/src/system/user/server/PersonaLifecycleManager.ts b/src/system/user/server/PersonaLifecycleManager.ts index 16e35f336..1963c11f2 100644 --- a/src/system/user/server/PersonaLifecycleManager.ts +++ b/src/system/user/server/PersonaLifecycleManager.ts @@ -12,6 +12,7 @@ import { Events } from '../../core/shared/Events'; import { Commands } from '../../core/shared/Commands'; import type { CommandParams } from '../../core/types/JTAGTypes'; +import { SecretManager } from '../../secrets/SecretManager'; interface KeyChangeEvent { provider: string; @@ -293,6 +294,7 @@ export class PersonaLifecycleManager { 'SENTINEL_PATH', ]; - return knownKeyVars.filter(key => !!process.env[key]); + const secrets = SecretManager.getInstance(); + return knownKeyVars.filter(key => Boolean(secrets.get(key, 'PersonaLifecycleManager.collectAvailableApiKeys'))); } } diff --git a/src/tests/unit/local-model-guardrails.test.ts b/src/tests/unit/local-model-guardrails.test.ts new file mode 100644 index 000000000..816247c4f --- /dev/null +++ b/src/tests/unit/local-model-guardrails.test.ts @@ -0,0 +1,26 @@ +import { describe, expect, it } from 'vitest'; +import { LOCAL_MODELS } from '@system/shared/Constants'; + +describe('LOCAL_MODELS guardrails', () => { + it('keeps accepted Qwen aliases mapped through the local runtime source of truth', () => { + expect(LOCAL_MODELS.mapToHuggingFace('qwen3.5')).toBe(LOCAL_MODELS.DEFAULT); + expect(LOCAL_MODELS.mapToHuggingFace('qwen3.5:4b')).toBe(LOCAL_MODELS.DEFAULT); + 
expect(LOCAL_MODELS.mapToHuggingFace('qwen2-vl')).toBe(LOCAL_MODELS.VISION); + }); + + it('rejects removed local aliases instead of silently routing stale llama/Candle configs', () => { + for (const alias of Object.keys(LOCAL_MODELS.REMOVED_LOCAL_ALIASES)) { + expect(() => LOCAL_MODELS.mapToHuggingFace(alias)).toThrow(/was removed from the runtime/); + } + }); + + it('rejects removed aliases even when callers append an instruction or quant suffix', () => { + expect(() => LOCAL_MODELS.mapToHuggingFace('llama3.2:3b-instruct')).toThrow(/Use 'qwen3.5'/); + expect(() => LOCAL_MODELS.mapToHuggingFace('phi3:mini-q4_k_m')).toThrow(/Use 'qwen2'/); + }); + + it('still accepts explicit HuggingFace ids for registry/catalog entries', () => { + const rawModel = 'Qwen/Qwen2.5-7B-Instruct'; + expect(LOCAL_MODELS.mapToHuggingFace(rawModel)).toBe(rawModel); + }); +});