Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/commands/ai/should-respond/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ PersonaUser.shouldRespondToMessage()
ChatRAGBuilder (reuse existing RAG assembly)
ai/generate (llama3.2:3b with gating prompt)
ai/generate (local Qwen with gating prompt)
Parse JSON response:
{
Expand Down Expand Up @@ -136,15 +136,15 @@ You are a conversation coordinator for a multi-party chat room.
- ✅ Explainable decisions (logs show reasoning)

**vs Expensive Model for Every Decision:**
- ✅ Use **llama3.2:3b** (2GB, fast, free)
- ✅ Use the local Qwen gating/default model (fast, free, Rust-admitted)
- ✅ Simple YES/NO decision (low temperature, 200 tokens)
- ✅ ~1-2 seconds per decision
- ✅ **Fail-safe fallback** to simple heuristics if AI unavailable

### Cost Analysis

**Current Problem**: All 3 personas generate full responses (12+ messages)
- 12 × llama3.2:3b calls = 12 × ~5 seconds = **60 seconds total**
- 12 × local model calls = 12 × ~5 seconds = **60 seconds total**
- 12 × 150 tokens = **1,800 tokens wasted**

**With AI Gating**:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand {
...markedHistory, // Conversation with trigger message marked
{ role: 'user', content: gatingInstruction }
],
model: params.model ?? LOCAL_MODELS.DEFAULT, // Candle uses pre-loaded model
model: params.model ?? LOCAL_MODELS.DEFAULT,
temperature: 0.3,
maxTokens: 200,
provider: 'candle'
provider: 'local'
};

const response = await AIProviderDaemon.generateText(request);
Expand All @@ -65,26 +65,26 @@ export class AIShouldRespondServerCommand extends AIShouldRespondCommand {

// If parsing failed (confidence = 0.0 means parse error), retry with better model to fix JSON
if (parsed.confidence === 0.0 && parsed.reason === 'Failed to parse AI response') {
console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with Candle to fix malformed JSON`);
console.warn(`⚠️ Gating JSON parse failed with ${request.model}, retrying with local Qwen to fix malformed JSON`);

const fixRequest: TextGenerationRequest = {
messages: [
{ role: 'system', content: 'You are a JSON repair tool. Fix malformed JSON and return valid JSON only.' },
{ role: 'user', content: `This JSON is malformed:\n\n${response.text}\n\nFix it and return ONLY valid JSON with this exact structure:\n{\n "shouldRespond": true/false,\n "confidence": 0.0-1.0,\n "reason": "string",\n "factors": {\n "mentioned": true/false,\n "questionAsked": true/false,\n "domainRelevant": true/false,\n "recentlySpoke": true/false,\n "othersAnswered": true/false\n }\n}` }
],
model: LOCAL_MODELS.DEFAULT, // Candle uses pre-loaded model
model: LOCAL_MODELS.DEFAULT,
temperature: 0.1, // Low temp for structured output
maxTokens: 200,
provider: 'candle'
provider: 'local'
};

const fixedResponse = await AIProviderDaemon.generateText(fixRequest);
if (fixedResponse.text) {
parsed = this.parseGatingResponse(fixedResponse.text);
if (parsed.confidence !== 0.0) {
console.log(`✅ JSON repair succeeded with Candle`);
console.log(`✅ JSON repair succeeded with local Qwen`);
} else {
throw new Error(`JSON repair failed even with Candle. Original: ${response.text.slice(0, 200)}`);
throw new Error(`JSON repair failed even with local Qwen. Original: ${response.text.slice(0, 200)}`);
}
} else {
throw new Error(`JSON repair request failed: ${fixedResponse.error}`);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
*
* Sentinel/Coordinator pattern: Use AI to intelligently gate persona responses
*
* Uses llama3.2:3b (validated, fast, cheap) to analyze full conversation context
* Uses the local Qwen gating model to analyze full conversation context
* and decide if a persona should respond to a message.
*/

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ export interface AIShouldRespondParams extends CommandParams {
/** Detection strategy (default: 'fast') */
readonly strategy?: ResponseStrategy;

/** Optional: Override model (defaults to llama3.2:3b for LLM strategy) */
/** Optional: Override model (defaults to LOCAL_MODELS.DEFAULT for LLM strategy) */
readonly model?: string;

/** Verbose mode - include full RAG context and prompt in response */
Expand Down Expand Up @@ -159,4 +159,3 @@ export const createAiShouldRespondResultFromParams = (
params: AIShouldRespondParams,
differences: Omit<AIShouldRespondResult, 'context' | 'sessionId' | 'userId'>
): AIShouldRespondResult => transformPayload(params, differences);

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import type { ICommandDaemon } from '../../../../daemons/command-daemon/shared/C
import type { AIValidateResponseParams, AIValidateResponseResult, ResponseDecision } from '../shared/AIValidateResponseTypes';
import { AIProviderDaemon } from '../../../../daemons/ai-provider-daemon/shared/AIProviderDaemon';
import type { TextGenerationRequest } from '../../../../daemons/ai-provider-daemon/shared/AIProviderTypesV2';
import { LOCAL_MODELS } from '../../../../system/shared/Constants';

export class AIValidateResponseServerCommand extends CommandBase<AIValidateResponseParams, AIValidateResponseResult> {
constructor(context: JTAGContext, subpath: string, commander: ICommandDaemon) {
Expand All @@ -27,10 +28,10 @@ export class AIValidateResponseServerCommand extends CommandBase<AIValidateRespo
{ role: 'system', content: 'You are a response validator. Reply ONLY with one word: SUBMIT, CLARIFY, or SILENT.' },
{ role: 'user', content: validationPrompt }
],
model: params.model ?? 'llama3.2:3b',
model: params.model ?? LOCAL_MODELS.GATING,
temperature: 0.1, // Low temp for consistent decisions
maxTokens: 10, // Just need one word
provider: 'candle'
provider: 'local'
};

const response = await AIProviderDaemon.generateText(request);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ export interface AIValidateResponseParams extends CommandParams {
/** Optional: Conversation context for better evaluation */
readonly conversationContext?: string;

/** Optional: Override model (defaults to llama3.2:3b) */
/** Optional: Override model (defaults to LOCAL_MODELS.GATING) */
readonly model?: string;

/** Verbose mode - include prompt and AI reasoning */
Expand Down Expand Up @@ -109,4 +109,3 @@ export const createAiValidateResponseResultFromParams = (
params: AIValidateResponseParams,
differences: Omit<AIValidateResponseResult, 'context' | 'sessionId' | 'userId'>
): AIValidateResponseResult => transformPayload(params, differences);

38 changes: 38 additions & 0 deletions src/system/shared/Constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,50 @@ export const LOCAL_MODELS = {
'qwen2.5': 'Qwen/Qwen2.5-7B-Instruct',
} as const,

/**
* Removed local runtime aliases.
*
* These used to route persona/chat inference through ad hoc llama/Candle
* paths. Local persona inference is now Qwen + Rust admission only. Fail
* loudly so stale DB rows or command params do not silently pick the wrong
* model/provider and burn CPU.
*
* Keys are the rejected alias names. mapToHuggingFace looks them up after
* lowercasing and trimming its input (and again after stripping an
* -instruct/-chat/-qN suffix), so every key must be written in lowercase.
* Values are the replacement alias named in the error message that
* mapToHuggingFace throws; they are suggestions only and are never used to
* resolve a model automatically.
*/
REMOVED_LOCAL_ALIASES: {
'llama3': 'qwen3.5',
'llama3:8b': 'qwen3.5',
'llama3.1': 'qwen3.5',
'llama3.1:8b': 'qwen3.5',
'llama3.2': 'qwen3.5',
'llama3.2:1b': 'qwen2',
'llama3.2:3b': 'qwen3.5',
'phi3': 'qwen2',
'phi3:mini': 'qwen2',
'tinyllama': 'qwen2',
'smollm2': 'qwen2',
'codellama': 'qwen3.5-code',
} as const,

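/**
* Map a model name to its HuggingFace ID.
* Returns the original name if no mapping is found (it might already be a HuggingFace ID).
* Throws an Error if the name, or the name with its -instruct/-chat/-qN suffix stripped,
* is one of the removed local aliases.
*/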
mapToHuggingFace(modelName: string): string {
const normalized = modelName.toLowerCase().trim();
const mapping = LOCAL_MODELS.LEGACY_TO_HUGGINGFACE as Record<string, string>;
const removedAliases = LOCAL_MODELS.REMOVED_LOCAL_ALIASES as Record<string, string>;

// Throws when `candidate` is a removed local alias, naming the suggested
// replacement in the error message; returns normally otherwise.
//
// Only the alias table's OWN keys are consulted. A bare bracket lookup also
// resolves members inherited from Object.prototype, so input such as
// 'constructor' or '__proto__' would be truthy and get rejected as a
// "removed alias" with a nonsensical replacement.
const assertNotRemoved = (candidate: string): void => {
  if (!Object.prototype.hasOwnProperty.call(removedAliases, candidate)) {
    return;
  }

  const replacement = removedAliases[candidate];
  if (!replacement) {
    return;
  }

  // Report the caller's original spelling (modelName), not the normalized candidate.
  throw new Error(
    `Local model alias '${modelName}' was removed from the runtime. ` +
    `Continuum local chat uses Qwen through Rust/llama.cpp admission only. ` +
    `Use '${replacement}' or LOCAL_MODELS.DEFAULT instead.`
  );
};

assertNotRemoved(normalized);

// Direct lookup
if (mapping[normalized]) {
Expand All @@ -214,6 +251,7 @@ export const LOCAL_MODELS = {

// Try without version suffix (e.g., 'qwen3.5:4b-instruct' -> 'qwen3.5:4b')
const withoutSuffix = normalized.replace(/-instruct.*$|-chat.*$|-q\d+.*$/i, '');
assertNotRemoved(withoutSuffix);
if (mapping[withoutSuffix]) {
return mapping[withoutSuffix];
}
Expand Down
4 changes: 3 additions & 1 deletion src/system/user/server/PersonaLifecycleManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import { Events } from '../../core/shared/Events';
import { Commands } from '../../core/shared/Commands';
import type { CommandParams } from '../../core/types/JTAGTypes';
import { SecretManager } from '../../secrets/SecretManager';

interface KeyChangeEvent {
provider: string;
Expand Down Expand Up @@ -293,6 +294,7 @@ export class PersonaLifecycleManager {
'SENTINEL_PATH',
];

return knownKeyVars.filter(key => !!process.env[key]);
const secrets = SecretManager.getInstance();
return knownKeyVars.filter(key => Boolean(secrets.get(key, 'PersonaLifecycleManager.collectAvailableApiKeys')));
}
}
26 changes: 26 additions & 0 deletions src/tests/unit/local-model-guardrails.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { describe, expect, it } from 'vitest';
import { LOCAL_MODELS } from '@system/shared/Constants';

/**
 * Guardrail suite for LOCAL_MODELS.mapToHuggingFace: accepted Qwen aliases
 * must keep resolving, removed llama/Candle-era aliases must be rejected, and
 * raw HuggingFace ids must pass through unchanged.
 */
describe('LOCAL_MODELS guardrails', () => {
  // Single entry point under test, shared by every case below.
  const resolve = (name: string): string => LOCAL_MODELS.mapToHuggingFace(name);

  it('keeps accepted Qwen aliases mapped through the local runtime source of truth', () => {
    const accepted = [
      { alias: 'qwen3.5', expected: LOCAL_MODELS.DEFAULT },
      { alias: 'qwen3.5:4b', expected: LOCAL_MODELS.DEFAULT },
      { alias: 'qwen2-vl', expected: LOCAL_MODELS.VISION },
    ];

    for (const { alias, expected } of accepted) {
      expect(resolve(alias)).toBe(expected);
    }
  });

  it('rejects removed local aliases instead of silently routing stale llama/Candle configs', () => {
    // Every key of the removed-alias table must be refused.
    Object.keys(LOCAL_MODELS.REMOVED_LOCAL_ALIASES).forEach((alias) => {
      expect(() => resolve(alias)).toThrow(/was removed from the runtime/);
    });
  });

  it('rejects removed aliases even when callers append an instruction or quant suffix', () => {
    const suffixed = [
      { alias: 'llama3.2:3b-instruct', message: /Use 'qwen3.5'/ },
      { alias: 'phi3:mini-q4_k_m', message: /Use 'qwen2'/ },
    ];

    for (const { alias, message } of suffixed) {
      expect(() => resolve(alias)).toThrow(message);
    }
  });

  it('still accepts explicit HuggingFace ids for registry/catalog entries', () => {
    const rawModel = 'Qwen/Qwen2.5-7B-Instruct';
    const resolved = resolve(rawModel);
    expect(resolved).toBe(rawModel);
  });
});
Loading