From e40c7c6092205f302496a0809b1898d398b17658 Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 15:46:24 -0500 Subject: [PATCH 1/2] Stabilize startup persona backpressure --- .../data/list/server/DataListServerCommand.ts | 22 +++- .../create/server/UserCreateServerCommand.ts | 25 ---- .../user-daemon/server/UserDaemonServer.ts | 36 ++++- src/scripts/launch-active-example.ts | 5 +- src/scripts/parallel-start.sh | 47 +++++-- src/scripts/seed-continuum.ts | 80 ++++++++++-- src/scripts/spawn-detached.mjs | 70 ++++++++++ .../BaseCoordinationStream.ts | 12 +- .../server/ChatCoordinationStream.ts | 2 +- .../core/system/server/ServiceInitializer.ts | 44 ++++--- src/system/data/entities/BaseEntity.ts | 54 +++++++- .../orchestration/SystemOrchestrator.ts | 13 +- .../user/server/PersonaLifecycleManager.ts | 20 +-- src/system/user/server/PersonaUser.ts | 56 ++++++-- .../server/modules/PersonaAutonomousLoop.ts | 5 + .../server/modules/PersonaMessageEvaluator.ts | 50 ++++++- .../modules/StartupAutonomousWorkGate.ts | 77 +++++++++++ .../unit/chat-coordination-stream.test.ts | 58 +++++++++ src/tests/unit/service-initializer.test.ts | 26 ++++ src/tests/unit/shared-node-boundary.test.ts | 86 ++++++++++++ .../unit/startup-autonomous-work-gate.test.ts | 48 +++++++ .../continuum-core/src/modules/channel.rs | 123 ++++++++++++++---- src/workers/continuum-core/src/orm/sqlite.rs | 70 ++++++++++ .../src/persona/self_task_generator.rs | 4 +- src/workers/start-workers.sh | 67 +++++++--- 25 files changed, 953 insertions(+), 147 deletions(-) create mode 100644 src/scripts/spawn-detached.mjs rename src/system/coordination/{shared => server}/BaseCoordinationStream.ts (97%) create mode 100644 src/system/user/server/modules/StartupAutonomousWorkGate.ts create mode 100644 src/tests/unit/chat-coordination-stream.test.ts create mode 100644 src/tests/unit/service-initializer.test.ts create mode 100644 src/tests/unit/shared-node-boundary.test.ts create mode 100644 
src/tests/unit/startup-autonomous-work-gate.test.ts diff --git a/src/commands/data/list/server/DataListServerCommand.ts b/src/commands/data/list/server/DataListServerCommand.ts index ebb5d271d..dac3524ad 100644 --- a/src/commands/data/list/server/DataListServerCommand.ts +++ b/src/commands/data/list/server/DataListServerCommand.ts @@ -99,10 +99,22 @@ export class DataListServerCommand extends CommandBase { + if (Array.isArray(value)) { + const fields = value.filter((field): field is string => typeof field === 'string' && field.length > 0); + return fields.length > 0 ? fields : undefined; + } + if (typeof value === 'string' && value.length > 0) { + return value.split(',').map(field => field.trim()).filter(Boolean); + } + return undefined; + }; + const selectColumns = normalizeProjection(params.fields) ?? normalizeProjection(params.select); const storageQuery = { collection, @@ -190,4 +202,4 @@ export class DataListServerCommand extends CommandBase>(); /** * Get singleton instance (for genome commands to access PersonaUsers) @@ -177,7 +178,7 @@ export class UserDaemonServer extends UserDaemon { // For PersonaUsers, create client instance if (userEntity.type === 'persona') { - await this.createPersonaClient(userEntity); + await this.ensurePersonaClient(userEntity); } // HumanUser and AgentUser managed by SessionDaemon @@ -296,7 +297,7 @@ export class UserDaemonServer extends UserDaemon { } // STEP 3: Create PersonaUser client instance - await this.createPersonaClient(userEntity); + await this.ensurePersonaClient(userEntity); } catch (error) { this.log.error(`❌ UserDaemon: Failed to ensure state for ${userEntity.displayName}:`, error); @@ -348,6 +349,35 @@ export class UserDaemonServer extends UserDaemon { } } + /** + * Ensure only one runtime PersonaUser is constructed per persisted user. + * + * Startup has multiple legitimate entry points: DataDaemon system:ready, + * UserDaemon deferred init, and real user-created events. They can overlap + * during cold boot. 
The database identity is singleton, so the runtime client + * must be singleton too; duplicate instances mean duplicate event handlers, + * duplicate inbox drains, and duplicate model calls for one persona. + */ + private async ensurePersonaClient(userEntity: UserEntity): Promise { + if (this.personaClients.has(userEntity.id)) { + return; + } + + const inflight = this.personaClientInitializations.get(userEntity.id); + if (inflight) { + await inflight; + return; + } + + const initialization = this.createPersonaClient(userEntity) + .finally(() => { + this.personaClientInitializations.delete(userEntity.id); + }); + + this.personaClientInitializations.set(userEntity.id, initialization); + await initialization; + } + /** * Ensure user has UserState entity */ @@ -523,4 +553,4 @@ export class UserDaemonServer extends UserDaemon { } this.personaClients.clear(); } -} \ No newline at end of file +} diff --git a/src/scripts/launch-active-example.ts b/src/scripts/launch-active-example.ts index 7027b0082..3d75fffe5 100644 --- a/src/scripts/launch-active-example.ts +++ b/src/scripts/launch-active-example.ts @@ -26,7 +26,8 @@ async function launchActiveExample(): Promise { const systemState = await systemOrchestrator.orchestrate('system-start', { workingDir, verbose: true, - browserUrl: undefined // Use default from configuration + browserUrl: undefined, // Use default from configuration + skipBrowser: process.env.CONTINUUM_DEFER_BROWSER === '1' || process.env.CONTINUUM_DEFER_BROWSER === 'true' }); if (!systemState.success) { @@ -75,4 +76,4 @@ function cleanup() { } // Run the launcher -launchActiveExample(); \ No newline at end of file +launchActiveExample(); diff --git a/src/scripts/parallel-start.sh b/src/scripts/parallel-start.sh index 21da9e57d..1c46e5a30 100755 --- a/src/scripts/parallel-start.sh +++ b/src/scripts/parallel-start.sh @@ -386,13 +386,27 @@ echo -e "\n${YELLOW}Phase 4: Launch system${NC}" # Ensure log directory exists mkdir -p "$CONTINUUM_ROOT/jtag/logs/system" 
+STARTUP_AUTONOMOUS_PAUSE="$CONTINUUM_ROOT/jtag/startup-autonomous-work.paused" +echo "$$" > "$STARTUP_AUTONOMOUS_PAUSE" +cleanup_startup_pause() { + rm -f "$STARTUP_AUTONOMOUS_PAUSE" +} +trap cleanup_startup_pause EXIT # Start the orchestrator as a daemon — it runs forever (WebSocket server is in-process). -# Redirect output to log file. system-stop.sh finds it by pattern "launch-active-example". -nohup npx tsx scripts/launch-active-example.ts \ - >> $CONTINUUM_ROOT/jtag/logs/system/orchestrator.log 2>&1 & -LAUNCH_PID=$! -disown $LAUNCH_PID +# Use the project-local tsx binary directly; `npx` is a short-lived wrapper and +# has caused false "daemon" starts where the launcher dies after npm start exits. +# Redirect stdin as well as output so parent shell/PTY teardown cannot touch it. +# system-stop.sh finds it by pattern "launch-active-example". +# Browser attachment happens after seed below. Starting the orchestrator with +# browser management enabled lets stale tabs reconnect during seed and trigger +# persona/RAG/model work while the database is still being synchronized. +TSX_BIN="$PROJECT_DIR/node_modules/.bin/tsx" +LAUNCH_PID=$(node "$PROJECT_DIR/scripts/spawn-detached.mjs" \ + --cwd "$PROJECT_DIR" \ + --log "$CONTINUUM_ROOT/jtag/logs/system/orchestrator.log" \ + --env CONTINUUM_DEFER_BROWSER=1 \ + -- "$TSX_BIN" scripts/launch-active-example.ts) echo "$LAUNCH_PID" > $CONTINUUM_ROOT/jtag/logs/system/npm-start.pid echo -e " Orchestrator started (PID $LAUNCH_PID, log: $CONTINUUM_ROOT/jtag/logs/system/orchestrator.log)" @@ -471,11 +485,28 @@ if [ "$SEED_RC" -ne 0 ]; then else echo -e " ${GREEN}✅ Seed complete${NC}" fi +cleanup_startup_pause -# Phase 6: Browser launch is handled by SystemOrchestrator.detectAndManageBrowser() -# The orchestrator runs as a daemon and manages browser lifecycle — open, detect, reconnect. -# Shell script does NOT open the browser to avoid duplicate tabs (#335). +# Phase 6: Browser attach happens only after seed. 
This script owns the final +# post-seed refresh/open so the orchestrator cannot race UI hydration against +# database synchronization. BROWSER_CONNECTED=false +if [ "$SEED_OK" = true ]; then + echo -e " ${YELLOW}Attaching browser after seed...${NC}" + PING_OUTPUT=$(./jtag ping --timeout=5000 2>/dev/null || echo '{}') + if echo "$PING_OUTPUT" | grep -q '"browser"' 2>/dev/null; then + if ./jtag interface/navigate >/dev/null 2>&1; then + BROWSER_CONNECTED=true + echo -e " ${GREEN}Browser refreshed after seed${NC}" + else + ./jtag development/exec --code="location.reload()" >/dev/null 2>&1 || true + fi + elif command -v open >/dev/null 2>&1; then + open "http://localhost:9000/chat/general" >/dev/null 2>&1 || true + elif command -v xdg-open >/dev/null 2>&1; then + xdg-open "http://localhost:9000/chat/general" >/dev/null 2>&1 || true + fi +fi if [ "$HOT_RESTART" = true ]; then # Hot restart: give existing tab time to reconnect via WebSocket echo -e " ⏳ Waiting for browser to reconnect..." diff --git a/src/scripts/seed-continuum.ts b/src/scripts/seed-continuum.ts index 04fab0c35..0b803226e 100644 --- a/src/scripts/seed-continuum.ts +++ b/src/scripts/seed-continuum.ts @@ -15,6 +15,7 @@ import { DEFAULT_USER_UNIQUE_IDS } from '../system/data/domains/DefaultEntities' import { ROOM_UNIQUE_IDS } from '../system/data/constants/RoomConstants'; import { generateUUID } from '../system/core/types/CrossPlatformUUID'; import { UserEntity } from '../system/data/entities/UserEntity'; +import { BaseEntity } from '../system/data/entities/BaseEntity'; import { RoomEntity } from '../system/data/entities/RoomEntity'; import { ChatMessageEntity } from '../system/data/entities/ChatMessageEntity'; import { ContentTypeEntity } from '../system/data/entities/ContentTypeEntity'; @@ -39,6 +40,7 @@ import { execWithRetry, } from './seed/helpers'; +const execRawAsync = promisify(exec); const execAsync = execWithRetry; /** Sync recipe JSON files to database — truly idempotent, ignores "already exists" 
*/ @@ -46,22 +48,75 @@ async function syncRecipesFromJson(): Promise { const recipesDir = path.join(__dirname, '..', 'system', 'recipes'); const recipeFiles = fs.readdirSync(recipesDir).filter(f => f.endsWith('.json')); console.log(` [Seed] 📝 Syncing ${recipeFiles.length} recipes...`); + const existingIds = new Set(); + try { + const { stdout } = await execRawAsync('./jtag data/list --collection=recipes --limit=1000 --skipCount=true --select=id', { timeout: 10000 }); + const parsed = JSON.parse(stdout); + for (const item of parsed.items || []) { + if (typeof item.id === 'string') existingIds.add(item.id); + } + } catch { + // Continue with create-first behavior if discovery fails. The per-record + // update fallback below still keeps the seed idempotent. + } let created = 0; - let existing = 0; + let updated = 0; + let unchanged = 0; + let failed = 0; for (const f of recipeFiles) { const data = JSON.parse(fs.readFileSync(path.join(recipesDir, f), 'utf-8')); const id = data.uniqueId; if (!id) continue; + const recipe = { + ...data, + id, + view: data.view || data.uniqueId, + entityType: data.entityType || null, + createdBy: data.createdBy || '00000000-0000-0000-0000-000000000000', + usageCount: data.usageCount || 0, + lastUsedAt: data.lastUsedAt || new Date().toISOString(), + tags: data.tags || [], + isPublic: data.isPublic !== false, + }; try { - const wasCreated = await createRecord('recipes', { ...data, id }, id, data.displayName || id); - if (wasCreated) created++; - else existing++; + if (!existingIds.has(id)) { + const wasCreated = await createRecord('recipes', recipe, id, data.displayName || id); + if (wasCreated) { + existingIds.add(id); + created++; + continue; + } + } + + const { stdout: readStdout } = await execRawAsync(`./jtag data/read --collection=recipes --id='${id}'`, { timeout: 10000 }); + const readResult = JSON.parse(readStdout); + if (readResult?.found && readResult?.data && !BaseEntity.hasContentDelta(readResult.data, recipe, { + ignoreFields: 
['createdBy', 'lastUsedAt', 'usageCount'] + })) { + unchanged++; + continue; + } + + const updateData = { ...recipe }; + delete updateData.createdBy; + delete updateData.lastUsedAt; + delete updateData.usageCount; + const dataArg = JSON.stringify(updateData).replace(/'/g, `'"'"'`); + const { stdout } = await execAsync(`./jtag data/update --collection=recipes --id='${id}' --data='${dataArg}' --suppressEvents=true`); + if (stdout.includes('"success": true') || stdout.includes('"success":true')) { + updated++; + } else { + failed++; + console.error(` [Seed] ❌ Failed to update recipe ${data.displayName || id}: ${stdout.slice(0, 300)}`); + } } catch { - // "Record already exists" or other non-fatal error — skip silently - existing++; + failed++; } } - console.log(` [Seed] ✅ Synced recipes (${created} new, ${existing} existing)`); + if (failed > 0) { + throw new Error(`Failed to sync ${failed}/${recipeFiles.length} recipes`); + } + console.log(` [Seed] ✅ Synced recipes (${created} new, ${updated} updated, ${unchanged} unchanged)`); } // ===== PERSONA PROFILE DATA (single source of truth for all persona bios + colors) ===== @@ -261,7 +316,7 @@ async function waitForJTAGReady(maxWaitSeconds: number = 480): Promise while (Date.now() - startTime < maxWaitSeconds * 1000) { try { - const { stdout } = await execAsync('./jtag ping'); + const { stdout } = await execRawAsync('./jtag ping', { timeout: 10000 }); // ROBUST: Extract JSON from potentially polluted output const firstBrace = stdout.indexOf('{'); @@ -279,7 +334,13 @@ async function waitForJTAGReady(maxWaitSeconds: number = 480): Promise response.server?.health?.commandsRegistered > 0) { // Also verify Rust IPC is connected — seed depends on data/create which goes through Rust ORM try { - const { stdout: dbCheck } = await execAsync('./jtag data/list --collection=users --limit=1', { timeout: 10000 }); + // Use the real Rust-backed ORM path, but keep the probe cheap. 
The + // previous `data/list --collection=users --limit=1` performed a COUNT + // plus a full-row query every retry; on cold start that turned the + // health check itself into data/query memory churn. `skipCount` and a + // single-column projection prove the data path is alive without + // competing with seed/persona startup. + const { stdout: dbCheck } = await execRawAsync('./jtag data/list --collection=users --limit=1 --skipCount=true --select=id', { timeout: 10000 }); if (dbCheck.includes('"success":true') || dbCheck.includes('"success": true')) { console.log(`✅ JTAG ready with ${response.server.health.commandsRegistered} commands + Rust IPC confirmed`); return true; @@ -293,6 +354,7 @@ async function waitForJTAGReady(maxWaitSeconds: number = 480): Promise if (attempts % 5 === 0) { console.log(` TS server ready but Rust worker not responding...`); console.log(` DEBUG: ${dbErr?.message || dbErr}`); + console.log(` DEBUG stdout: ${dbErr?.stdout?.slice?.(0, 500) || 'none'}`); console.log(` DEBUG stderr: ${dbErr?.stderr?.slice?.(0, 200) || 'none'}`); } } diff --git a/src/scripts/spawn-detached.mjs b/src/scripts/spawn-detached.mjs new file mode 100644 index 000000000..d832549d1 --- /dev/null +++ b/src/scripts/spawn-detached.mjs @@ -0,0 +1,70 @@ +#!/usr/bin/env node +import { openSync } from 'fs'; +import { spawn } from 'child_process'; + +const args = process.argv.slice(2); +let cwd = process.cwd(); +let logPath = null; +let ulimitVirtualMemoryKb = null; +const env = { ...process.env }; +let i = 0; + +for (; i < args.length; i += 1) { + const arg = args[i]; + if (arg === '--') { + i += 1; + break; + } + if (arg === '--cwd') { + cwd = args[++i]; + continue; + } + if (arg === '--log') { + logPath = args[++i]; + continue; + } + if (arg === '--env') { + const assignment = args[++i]; + const equalsIndex = assignment.indexOf('='); + if (equalsIndex <= 0) { + throw new Error(`Invalid --env assignment: ${assignment}`); + } + env[assignment.slice(0, equalsIndex)] = 
assignment.slice(equalsIndex + 1); + continue; + } + if (arg === '--ulimit-v-kb') { + ulimitVirtualMemoryKb = args[++i]; + continue; + } + throw new Error(`Unknown option: ${arg}`); +} + +let command = args[i]; +let commandArgs = args.slice(i + 1); +if (!command) { + throw new Error('Usage: spawn-detached.mjs [--cwd DIR] [--log FILE] [--env K=V] -- command [args...]'); +} + +if (ulimitVirtualMemoryKb) { + commandArgs = [ + '-lc', + 'ulimit -v "$1" 2>/dev/null || true; shift; exec "$@"', + 'spawn-detached-ulimit', + String(ulimitVirtualMemoryKb), + command, + ...commandArgs, + ]; + command = '/bin/bash'; +} + +const out = logPath ? openSync(logPath, 'a') : 'ignore'; +const err = logPath ? out : 'ignore'; +const child = spawn(command, commandArgs, { + cwd, + env, + detached: true, + stdio: ['ignore', out, err], +}); + +child.unref(); +console.log(child.pid); diff --git a/src/system/coordination/shared/BaseCoordinationStream.ts b/src/system/coordination/server/BaseCoordinationStream.ts similarity index 97% rename from src/system/coordination/shared/BaseCoordinationStream.ts rename to src/system/coordination/server/BaseCoordinationStream.ts index 267ac0d0a..19399e997 100644 --- a/src/system/coordination/shared/BaseCoordinationStream.ts +++ b/src/system/coordination/server/BaseCoordinationStream.ts @@ -21,10 +21,8 @@ */ import { EventEmitter } from 'events'; -import * as path from 'path'; import type { UUID } from '../../core/types/CrossPlatformUUID'; -import { Logger, FileMode, type ComponentLogger } from '../../core/logging/Logger'; -import { SystemPaths } from '../../core/config/SystemPaths'; +import { Logger, type ComponentLogger } from '../../core/logging/Logger'; /** * Domain-agnostic thought (claim to respond) @@ -187,15 +185,11 @@ export abstract class BaseCoordinationStream< } /** - * Hook: Get probabilistic max responders + * Hook: Get max responders. 
* Subclasses can customize slot allocation */ protected getMaxResponders(): number { - // Default: probabilistic (70% = 1, 25% = 2, 5% = 3) - const rand = Math.random(); - if (rand < 0.70) return 1; - if (rand < 0.95) return 2; - return 3; + return this.config.maxResponders; } /** diff --git a/src/system/coordination/server/ChatCoordinationStream.ts b/src/system/coordination/server/ChatCoordinationStream.ts index 71c85810c..50ce74cba 100644 --- a/src/system/coordination/server/ChatCoordinationStream.ts +++ b/src/system/coordination/server/ChatCoordinationStream.ts @@ -21,7 +21,7 @@ import { type BaseDecision, type BaseStream, type CoordinationConfig -} from '../shared/BaseCoordinationStream'; +} from './BaseCoordinationStream'; /** * Chat-specific thought (extends base with chat metadata) diff --git a/src/system/core/system/server/ServiceInitializer.ts b/src/system/core/system/server/ServiceInitializer.ts index 9783295ec..5933068df 100644 --- a/src/system/core/system/server/ServiceInitializer.ts +++ b/src/system/core/system/server/ServiceInitializer.ts @@ -13,23 +13,33 @@ import { Logger } from '../../logging/Logger'; const log = Logger.create('ServiceInitializer'); +export function shouldInitializeCodebaseIndexing( + env: NodeJS.ProcessEnv = process.env, + nodeEnv: string | undefined = process.env.NODE_ENV, +): boolean { + if (env.SKIP_CODEBASE_INDEX === '1' || env.SKIP_CODEBASE_INDEX === 'true') { + return false; + } + if (nodeEnv === 'production') { + return false; + } + return env.CONTINUUM_ENABLE_CODEBASE_INDEX === '1' || env.CONTINUUM_ENABLE_CODEBASE_INDEX === 'true'; +} + /** - * Background codebase indexing — runs incremental index after startup. - * Fire-and-forget: doesn't block server startup, logs results. - * - * Skippable via SKIP_CODEBASE_INDEX=1 for validation / debugging when the - * indexer's data/query saturation masks unrelated chat-path issues. The - * indexer is an optimization; disabling it doesn't break chat or personas. 
+ * Background codebase indexing — runs incremental index only when explicitly + * enabled. Code RAG is useful enrichment, but it is not a boot dependency. On + * a fresh checkout it can generate thousands of code_index writes and sustained + * ONNX embedding batches; doing that during seed/readiness starves chat, + * persona inbox service, and first-run UX. */ function initializeCodebaseIndexing(): void { - if (process.env.SKIP_CODEBASE_INDEX === '1' || process.env.SKIP_CODEBASE_INDEX === 'true') { - log.info('Background codebase indexing SKIPPED (SKIP_CODEBASE_INDEX set)'); + if (!shouldInitializeCodebaseIndexing()) { + log.info('Background codebase indexing skipped (set CONTINUUM_ENABLE_CODEBASE_INDEX=1 to enable)'); return; } - // Delay 120s — personas must boot and respond to first chats before - // indexing starts. At 10s the embedding storm saturates the event loop - // and blocks ALL persona responses for 2+ minutes. Chat is the product; - // codebase search is optimization that can wait. + // Delay 120s even when explicitly enabled. This gives seed + first chat a + // clean lane before the embedding-heavy indexer starts. setTimeout(async () => { try { const { getCodebaseIndexer } = await import('../../../rag/services/CodebaseIndexer'); @@ -89,14 +99,8 @@ export async function initializeServices(): Promise { initializeTrainingRecovery(); log.debug('Training recovery service initialized'); - // Codebase indexing: background incremental index so personas can answer code questions. - // Skip in production/Docker — no source tree to index, and the ORM.store() events - // (data:code_index:created × thousands) peg the CPU at 100% and starve voice/chat. - if (process.env.NODE_ENV !== 'production') { - initializeCodebaseIndexing(); - } else { - log.info('Skipping codebase indexing (production mode)'); - } + // Codebase indexing is opt-in. It is RAG enrichment, not readiness. 
+ initializeCodebaseIndexing(); const ms = Date.now() - start; log.info(`Cross-cutting services initialized (${ms}ms)`); diff --git a/src/system/data/entities/BaseEntity.ts b/src/system/data/entities/BaseEntity.ts index 5cd4b78d4..ed60826d2 100644 --- a/src/system/data/entities/BaseEntity.ts +++ b/src/system/data/entities/BaseEntity.ts @@ -91,6 +91,58 @@ export abstract class BaseEntity { }; } + /** + * Deterministic content fingerprint for "do I need to update?" decisions. + * Callers compare semantic fields, not ORM churn fields such as updatedAt. + * This keeps seed/sync/update flows idempotent without per-script equality + * rules. + */ + static contentFingerprint( + data: Record, + options: { ignoreFields?: string[] } = {} + ): string { + const ignore = new Set([ + 'createdAt', + 'updatedAt', + 'version', + ...(options.ignoreFields ?? []) + ]); + return BaseEntity.stableContentString(BaseEntity.pickComparableFields(data, ignore)); + } + + static hasContentDelta( + existing: Record, + desired: Record, + options: { ignoreFields?: string[] } = {} + ): boolean { + const desiredKeys = new Set(Object.keys(desired)); + const existingProjection: Record = {}; + for (const key of desiredKeys) { + existingProjection[key] = existing[key] ?? null; + } + return BaseEntity.contentFingerprint(existingProjection, options) !== + BaseEntity.contentFingerprint(desired, options); + } + + private static pickComparableFields(data: Record, ignore: Set): Record { + const picked: Record = {}; + for (const [key, value] of Object.entries(data)) { + if (!ignore.has(key)) picked[key] = value ?? 
null; + } + return picked; + } + + private static stableContentString(value: unknown): string { + if (value === undefined) return 'null'; + if (value === null || typeof value !== 'object') return JSON.stringify(value); + if (value instanceof Date) return JSON.stringify(value.toISOString()); + if (Array.isArray(value)) { + return `[${value.map(item => BaseEntity.stableContentString(item)).join(',')}]`; + } + const obj = value as Record; + return `{${Object.keys(obj).sort().map(key => `${JSON.stringify(key)}:${BaseEntity.stableContentString(obj[key])}`).join(',')}}`; + } + /** * Factory method to create entities with validation */ @@ -189,4 +241,4 @@ export abstract class BaseEntity { type: eventType }; } -} \ No newline at end of file +} diff --git a/src/system/orchestration/SystemOrchestrator.ts b/src/system/orchestration/SystemOrchestrator.ts index 7bc8077a9..3aaa094c0 100644 --- a/src/system/orchestration/SystemOrchestrator.ts +++ b/src/system/orchestration/SystemOrchestrator.ts @@ -427,7 +427,7 @@ export class SystemOrchestrator extends EventEmitter { return await this.executeBrowserInterface(); case SYSTEM_MILESTONES.BROWSER_READY: - return await this.executeBrowserReady(); + return await this.executeBrowserReady(options); case SYSTEM_MILESTONES.SYSTEM_HEALTHY: return await this.executeSystemHealthy(); @@ -1328,7 +1328,16 @@ export class SystemOrchestrator extends EventEmitter { return true; } - private async executeBrowserReady(): Promise { + private async executeBrowserReady(options: OrchestrationOptions): Promise { + if (options.skipBrowser) { + console.debug('⏭️ Browser readiness deferred (skipBrowser option)'); + await milestoneEmitter.completeMilestone( + SYSTEM_MILESTONES.BROWSER_READY, + this.currentEntryPoint + ); + return true; + } + console.debug('⏳ Waiting for browser to be ready...'); // For now, assume browser is ready after launch diff --git a/src/system/user/server/PersonaLifecycleManager.ts b/src/system/user/server/PersonaLifecycleManager.ts 
index e7741c90f..1e4c2e213 100644 --- a/src/system/user/server/PersonaLifecycleManager.ts +++ b/src/system/user/server/PersonaLifecycleManager.ts @@ -113,16 +113,16 @@ export class PersonaLifecycleManager { console.log(`✅ PersonaLifecycleManager: ${created} persona(s) activated on startup`); - // Cold-start prewarming: fire a tiny no-op generation per local persona - // so DMR loads the model + warms the slot BEFORE the user's first message. - // Without this, the first real chat eats a ~6s model-load cold start - // PLUS the normal generation time — felt like an eternity ("ais take a - // long time to load"). With prewarm, the model is resident and ready; - // first chat hits a warm slot. - // - // Fire-and-forget: doesn't block boot, doesn't fail boot if DMR is down. - // Cloud personas are skipped — their providers are already "warm" by API. - void this.prewarmAllPersonas(allocation.allocations); + // Local model prewarm allocates the full model/KV context. Doing that at + // boot competes with seed, browser reconnect, and first room hydration, and + // on unified-memory Macs can push continuum-core into OS pressure before + // the system is actually ready. Keep it as an explicit performance knob, + // not default startup behavior. 
+ if (process.env.CONTINUUM_PREWARM_PERSONAS === '1' || process.env.CONTINUUM_PREWARM_PERSONAS === 'true') { + void this.prewarmAllPersonas(allocation.allocations); + } else { + console.log('⏭️ PersonaLifecycleManager: local model prewarm skipped (set CONTINUUM_PREWARM_PERSONAS=1 to enable)'); + } } /** diff --git a/src/system/user/server/PersonaUser.ts b/src/system/user/server/PersonaUser.ts index 319fb40ed..d8f8073d9 100644 --- a/src/system/user/server/PersonaUser.ts +++ b/src/system/user/server/PersonaUser.ts @@ -1234,7 +1234,12 @@ export class PersonaUser extends AIUser { /** * Catch up on messages since last processed bookmark * Uses roomReadState from UserStateEntity to track per-room progress - * Ensures no messages are missed even after system restart + * Startup policy: + * - Default: bookmark the current tail for every room; do not generate from + * historical backlog during boot. Restart is not a "catch up" moment: + * generating from old room traffic caused startup storms and stale replies. + * - Opt-in: CONTINUUM_PROCESS_STARTUP_BACKLOG=1 consolidates backlog into one + * latest-room signal per room for explicit replay tests. 
*/ private async catchUpOnRecentMessages(): Promise { try { @@ -1245,12 +1250,43 @@ export class PersonaUser extends AIUser { } let totalCaughtUp = 0; + let totalBookmarked = 0; + const processStartupBacklog = process.env.CONTINUUM_PROCESS_STARTUP_BACKLOG === '1' || + process.env.CONTINUUM_PROCESS_STARTUP_BACKLOG === 'true'; // Process each room's bookmark independently for (const roomId of roomIds) { + const latest = await ORM.query({ + collection: COLLECTIONS.CHAT_MESSAGES, + filter: { + roomId, + senderId: { $ne: this.id }, + senderType: { $ne: 'system' } + }, + sort: [{ field: 'timestamp', direction: 'desc' }], + limit: 1 + }, 'default'); + + const latestMessage = latest.success && latest.data?.[0]?.data; + if (!latestMessage) { + continue; + } + + if (!processStartupBacklog) { + await this.updateMessageBookmark(roomId, latestMessage.timestamp, latestMessage.id); + totalBookmarked += 1; + continue; + } + // Direct property access (state may be plain object from DB) const roomState = this.state.roomReadState?.[roomId]; - const cutoffTime = roomState?.lastReadMessageTimestamp || new Date(0).toISOString(); + const cutoffTime = roomState?.lastReadMessageTimestamp; + + if (!cutoffTime) { + await this.updateMessageBookmark(roomId, latestMessage.timestamp, latestMessage.id); + totalBookmarked += 1; + continue; + } const recentMessages = await ORM.query({ collection: COLLECTIONS.CHAT_MESSAGES, @@ -1269,17 +1305,19 @@ export class PersonaUser extends AIUser { } const messages = recentMessages.data.map(r => r.data); - this.log.info(`🔄 ${this.displayName}: Catching up on ${messages.length} messages in room ${roomId.slice(0,8)}`); - - for (const message of messages) { - await this.handleChatMessage(message); - } + const latestBacklogMessage = messages[messages.length - 1]; + this.log.info(`🔄 ${this.displayName}: Consolidating ${messages.length} catch-up messages in room ${roomId.slice(0,8)} into one latest-room signal`); - totalCaughtUp += messages.length; + await 
this.handleChatMessage(latestBacklogMessage); + totalCaughtUp += 1; } if (totalCaughtUp > 0) { - this.log.info(`✅ ${this.displayName}: Catch-up complete (${totalCaughtUp} messages)`); + this.log.info(`✅ ${this.displayName}: Catch-up complete (${totalCaughtUp} consolidated room signal(s))`); + } + + if (totalBookmarked > 0) { + this.log.info(`🔖 ${this.displayName}: Startup catch-up advanced ${totalBookmarked} room bookmark(s) to current tail; backlog generation disabled`); } } catch (error) { this.log.warn(`⚠️ ${this.displayName}: Catch-up failed (non-fatal):`, error); diff --git a/src/system/user/server/modules/PersonaAutonomousLoop.ts b/src/system/user/server/modules/PersonaAutonomousLoop.ts index 6ff028290..0dff76a18 100644 --- a/src/system/user/server/modules/PersonaAutonomousLoop.ts +++ b/src/system/user/server/modules/PersonaAutonomousLoop.ts @@ -26,6 +26,7 @@ import type { SelfTaskGenerator } from './SelfTaskGenerator'; import type { PersonaUser } from '../PersonaUser'; import { PersonaTimingConfig } from './PersonaTimingConfig'; import { BackpressureService } from '../../../core/services/BackpressureService'; +import { StartupAutonomousWorkGate } from './StartupAutonomousWorkGate'; /** Gap assessment runs every N service cycles (~25-50s during active operation) */ const GAP_ASSESSMENT_INTERVAL = PersonaTimingConfig.selfTask.gapAssessmentInterval; @@ -97,6 +98,8 @@ export class PersonaAutonomousLoop { private async runServiceLoop(): Promise { const { maxConsecutiveFailures, cooldownMs } = PersonaTimingConfig.circuitBreaker; + await StartupAutonomousWorkGate.waitUntilOpen(this.log, `${this.personaUser.displayName} startup drain`); + // Drain anything queued in Rust BEFORE the service loop started. // Race: chat items routed via PersonaInbox.route → channelEnqueue // emit 'work-available' on the TS signal IMMEDIATELY. If no listener @@ -163,6 +166,8 @@ export class PersonaAutonomousLoop { * 2. 
Drain loop: call Rust serviceCycleFull repeatedly until queue empty */ private async serviceInbox(): Promise { + await StartupAutonomousWorkGate.waitUntilOpen(this.log, `${this.personaUser.displayName} inbox service`); + const cadence = this.personaUser.prefrontal!.personaState.getCadence(); const hasWork = await this.personaUser.inbox.waitForWork(cadence); diff --git a/src/system/user/server/modules/PersonaMessageEvaluator.ts b/src/system/user/server/modules/PersonaMessageEvaluator.ts index 8dea4a511..118d2bb3a 100644 --- a/src/system/user/server/modules/PersonaMessageEvaluator.ts +++ b/src/system/user/server/modules/PersonaMessageEvaluator.ts @@ -30,7 +30,7 @@ import type { RAGContext } from '../../../data/entities/CoordinationDecisionEnti import type { RAGContext as PipelineRAGContext, RAGArtifact } from '../../../rag/shared/RAGTypes'; import { truncate } from '../../../../shared/utils/StringUtils'; import type { DecisionContext } from './cognition/adapters/IDecisionAdapter'; -import { getChatCoordinator } from '../../../coordination/server/ChatCoordinationStream'; +import { getChatCoordinator, type ChatThought } from '../../../coordination/server/ChatCoordinationStream'; import { calculateMessagePriority } from './PersonaInbox'; import { toInboxMessageRequest } from './RustCognitionBridge'; import type { SenderType, FullEvaluateResult, SocialSignals } from '../../../../shared/generated'; @@ -175,6 +175,18 @@ export class PersonaMessageEvaluator { return; } + const coordinationStart = Date.now(); + const claimGranted = await this.coordinateResponseClaim(messageEntity, earlyResult); + evalTiming['coordination_claim'] = Date.now() - coordinationStart; + if (!claimGranted) { + this.personaUser.logAIDecision('SILENT', 'coordination: another persona owns this turn', { + message: safeMessageText.slice(0, 100), + sender: messageEntity.senderName, + roomId: messageEntity.roomId, + }); + return; + } + // ECHO CHAMBER: Now handled by Rust Gate 6 inside fullEvaluate() 
above. // No separate TS-side check needed — Rust checks echo chamber atomically. @@ -718,6 +730,42 @@ export class PersonaMessageEvaluator { this.log(`🧠 ${this.personaUser.displayName}: State updated (energy=${this.personaUser.personaState.getState().energy.toFixed(2)}, mood=${this.personaUser.personaState.getState().mood})`); } + /** + * One room message should become one coordinated response turn unless the + * room explicitly allows more responders. The cheap Rust gate may say several + * personas are eligible; this claim step selects the responder before RAG, + * memory recall, embeddings, or generation begin. + */ + private async coordinateResponseClaim( + messageEntity: ProcessableMessage, + earlyResult: FullEvaluateResult, + ): Promise { + const coordinator = getChatCoordinator(); + const thought: ChatThought = { + personaId: this.personaUser.id, + personaName: this.personaUser.displayName, + type: 'claiming', + confidence: earlyResult.confidence, + reasoning: `${earlyResult.gate}: ${earlyResult.reason}`, + timestamp: Date.now(), + messageId: messageEntity.id, + roomId: messageEntity.roomId, + }; + + await coordinator.broadcastChatThought(messageEntity.id, messageEntity.roomId, thought); + const decision = await coordinator.waitForChatDecision(messageEntity.id); + if (!decision) { + this.log(`⏰ ${this.personaUser.displayName}: Coordination timeout for ${messageEntity.id.slice(0, 8)} — deferring`); + return false; + } + + const granted = decision.granted.includes(this.personaUser.id); + if (!granted) { + this.log(`🧵 ${this.personaUser.displayName}: Deferring ${messageEntity.id.slice(0, 8)} to coordinated responder`); + } + return granted; + } + /** * Build CoordinationDecision RAGContext from ChatRAGBuilder output * Converts domain-specific RAG format to universal decision logging format diff --git a/src/system/user/server/modules/StartupAutonomousWorkGate.ts b/src/system/user/server/modules/StartupAutonomousWorkGate.ts new file mode 100644 index 
000000000..688a04276 --- /dev/null +++ b/src/system/user/server/modules/StartupAutonomousWorkGate.ts @@ -0,0 +1,77 @@ +import fs from 'fs'; +import path from 'path'; +import { SystemPaths } from '../../../core/config/SystemPaths'; + +const DEFAULT_PAUSE_FILE = path.join(SystemPaths.root, 'jtag', 'startup-autonomous-work.paused'); +const DEFAULT_MAX_WAIT_MS = 10 * 60 * 1000; +const DEFAULT_POLL_MS = 1000; + +export class StartupAutonomousWorkGate { + static get pauseFile(): string { + return process.env.CONTINUUM_STARTUP_AUTONOMOUS_PAUSE_FILE || DEFAULT_PAUSE_FILE; + } + + static isPaused(): boolean { + if (process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED === '1' || process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED === 'true') { + return true; + } + + const pauseFile = this.pauseFile; + if (!fs.existsSync(pauseFile)) { + return false; + } + + const ownerPid = this.readOwnerPid(pauseFile); + if (ownerPid !== null && !this.isProcessAlive(ownerPid)) { + fs.rmSync(pauseFile, { force: true }); + return false; + } + + return true; + } + + static async waitUntilOpen( + log?: (message: string) => void, + label: string = 'autonomous work', + options: { maxWaitMs?: number; pollMs?: number } = {} + ): Promise { + if (!this.isPaused()) return; + + const maxWaitMs = options.maxWaitMs ?? DEFAULT_MAX_WAIT_MS; + const pollMs = options.pollMs ?? 
DEFAULT_POLL_MS; + const startedAt = Date.now(); + log?.(`⏸️ Startup gate closed — deferring ${label} until seed completes`); + while (this.isPaused()) { + if (Date.now() - startedAt >= maxWaitMs) { + log?.(`⚠️ Startup gate still closed after ${Math.round(maxWaitMs / 1000)}s — failing open for ${label}`); + return; + } + await new Promise(resolve => setTimeout(resolve, pollMs)); + } + log?.(`▶️ Startup gate open — resuming ${label}`); + } + + private static readOwnerPid(pauseFile: string): number | null { + try { + const raw = fs.readFileSync(pauseFile, 'utf8').trim(); + if (!/^\d+$/.test(raw)) { + return null; + } + return Number(raw); + } catch { + return null; + } + } + + private static isProcessAlive(pid: number): boolean { + if (!Number.isSafeInteger(pid) || pid <= 0) { + return false; + } + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } + } +} diff --git a/src/tests/unit/chat-coordination-stream.test.ts b/src/tests/unit/chat-coordination-stream.test.ts new file mode 100644 index 000000000..f699c140b --- /dev/null +++ b/src/tests/unit/chat-coordination-stream.test.ts @@ -0,0 +1,58 @@ +import { describe, expect, it } from 'vitest'; +import { ChatCoordinationStream, type ChatThought } from '../../system/coordination/server/ChatCoordinationStream'; +import type { UUID } from '../../system/core/types/CrossPlatformUUID'; + +function thought(personaId: string, confidence: number, messageId: string = 'message-1'): ChatThought { + return { + personaId: personaId as UUID, + personaName: personaId, + type: 'claiming', + confidence, + reasoning: 'unit-test claim', + timestamp: Date.now(), + messageId, + roomId: '00000000-0000-4000-8000-000000000001' as UUID, + }; +} + +describe('ChatCoordinationStream', () => { + it('grants only the configured responder count for a chat turn', async () => { + const roomId = '00000000-0000-4000-8000-000000000001' as UUID; + const coordinator = new ChatCoordinationStream({ + maxResponders: 1, + 
intentionWindowMs: 10, + enableLogging: false, + }); + + await coordinator.broadcastChatThought('message-1', roomId, thought('00000000-0000-4000-8000-000000000011', 0.6)); + await coordinator.broadcastChatThought('message-1', roomId, thought('00000000-0000-4000-8000-000000000012', 0.9)); + + const decision = await coordinator.waitForChatDecision('message-1', 100); + coordinator.shutdown(); + + expect(decision?.granted).toEqual(['00000000-0000-4000-8000-000000000012']); + expect(decision?.denied).toContain('00000000-0000-4000-8000-000000000011'); + }); + + it('grants multiple responders by configured confidence order', async () => { + const roomId = '00000000-0000-4000-8000-000000000001' as UUID; + const coordinator = new ChatCoordinationStream({ + maxResponders: 2, + intentionWindowMs: 10, + enableLogging: false, + }); + + await coordinator.broadcastChatThought('message-2', roomId, thought('00000000-0000-4000-8000-000000000021', 0.4, 'message-2')); + await coordinator.broadcastChatThought('message-2', roomId, thought('00000000-0000-4000-8000-000000000022', 0.95, 'message-2')); + await coordinator.broadcastChatThought('message-2', roomId, thought('00000000-0000-4000-8000-000000000023', 0.8, 'message-2')); + + const decision = await coordinator.waitForChatDecision('message-2', 100); + coordinator.shutdown(); + + expect(decision?.granted).toEqual([ + '00000000-0000-4000-8000-000000000022', + '00000000-0000-4000-8000-000000000023', + ]); + expect(decision?.denied).toEqual(['00000000-0000-4000-8000-000000000021']); + }); +}); diff --git a/src/tests/unit/service-initializer.test.ts b/src/tests/unit/service-initializer.test.ts new file mode 100644 index 000000000..4f481c7d1 --- /dev/null +++ b/src/tests/unit/service-initializer.test.ts @@ -0,0 +1,26 @@ +import { describe, expect, it } from 'vitest'; +import { shouldInitializeCodebaseIndexing } from '../../system/core/system/server/ServiceInitializer'; + +describe('ServiceInitializer', () => { + 
describe('shouldInitializeCodebaseIndexing', () => { + it('keeps codebase indexing off by default during development startup', () => { + expect(shouldInitializeCodebaseIndexing({}, 'development')).toBe(false); + }); + + it('allows explicit opt-in outside production', () => { + expect(shouldInitializeCodebaseIndexing({ CONTINUUM_ENABLE_CODEBASE_INDEX: '1' }, 'development')).toBe(true); + expect(shouldInitializeCodebaseIndexing({ CONTINUUM_ENABLE_CODEBASE_INDEX: 'true' }, 'test')).toBe(true); + }); + + it('lets skip override opt-in', () => { + expect(shouldInitializeCodebaseIndexing({ + CONTINUUM_ENABLE_CODEBASE_INDEX: '1', + SKIP_CODEBASE_INDEX: '1', + }, 'development')).toBe(false); + }); + + it('never auto-indexes in production startup', () => { + expect(shouldInitializeCodebaseIndexing({ CONTINUUM_ENABLE_CODEBASE_INDEX: '1' }, 'production')).toBe(false); + }); + }); +}); diff --git a/src/tests/unit/shared-node-boundary.test.ts b/src/tests/unit/shared-node-boundary.test.ts new file mode 100644 index 000000000..41cefe4ad --- /dev/null +++ b/src/tests/unit/shared-node-boundary.test.ts @@ -0,0 +1,86 @@ +import { describe, expect, it } from 'vitest'; +import { readdirSync, readFileSync, statSync } from 'fs'; +import { join, relative } from 'path'; + +const ROOT = process.cwd(); +const NODE_IMPORT_PATTERN = + /(?:from|import)\s+['"](?:node:)?(?:fs|fs\/promises|path|crypto|os|child_process|events)['"]|from\s+['"](?:node:)?(?:fs|fs\/promises|path|crypto|os|child_process|events)['"]|require\(['"](?:node:)?(?:fs|fs\/promises|path|crypto|os|child_process|events)['"]\)/; + +// Ratchet, not approval: these are existing shared/browser-boundary violations. +// New paths should not be added casually. If a shared module genuinely needs a +// Node builtin, move it under a server-only boundary where possible; otherwise +// document the architectural reason in the commit that updates this set. 
+const KNOWN_SHARED_NODE_IMPORTS = new Set([ + 'commands/ai/dataset/shared/parsers/GitHistoryParser.ts', + 'commands/list/shared/ListCommand.ts', + 'commands/logs/shared/LogsShared.ts', + 'commands/media/process/shared/MediaProcessTypes.ts', + 'commands/utilities/docs/shared/DocFileRegistry.ts', + 'commands/workspace/git/shared/resolveWorkspacePath.ts', + 'daemons/ai-provider-daemon/adapters/candle/shared/CandleAdapter.ts', + 'daemons/ai-provider-daemon/adapters/sentinel/shared/SentinelAdapter.ts', + 'daemons/ai-provider-daemon/shared/BaseAIProviderAdapter.ts', + 'daemons/ai-provider-daemon/shared/HardwareProfile.ts', + 'daemons/ai-provider-daemon/shared/LlamaCppAdapter.ts', + 'daemons/ai-provider-daemon/shared/adapters/BaseLocalAdapter.ts', + 'daemons/file-daemon/shared/FileDaemon.ts', + 'examples/shared/ConnectionConfigFactory.ts', + 'generator/shared/SpecSerializer.ts', + 'scripts/shared/Preflight.ts', + 'shared/ModelRegistry.ts', + 'shared/ipc/archive-worker/CommandRouterServer.ts', + 'shared/utils/ProcessUtils.ts', + 'shared/workers/PersonaWorkerThread.ts', + 'system/core/router/shared/JTAGRouterOptimized.ts', + 'system/core/shared/TimingHarness.ts', + 'system/rag/shared/PromptCapture.ts', + 'system/shared/Config.ts', + 'system/typescript/shared/TypeScriptCompiler.ts', + 'system/user/shared/BaseUser.ts', + 'tests/shared/AdvancedPerformanceTester.ts', + 'tests/shared/PerformanceTester.ts', + 'tests/shared/ScreenshotTesting.ts', + 'tests/shared/TestAssertions.ts', + 'tests/shared/TestConfig.ts', + 'tests/shared/TestRunner.ts', +]); + +function walk(dir: string): string[] { + const results: string[] = []; + for (const entry of readdirSync(dir)) { + if (entry === 'node_modules' || entry === 'dist' || entry === 'build') { + continue; + } + + const fullPath = join(dir, entry); + const stat = statSync(fullPath); + if (stat.isDirectory()) { + results.push(...walk(fullPath)); + } else if (entry.endsWith('.ts') || entry.endsWith('.tsx')) { + results.push(fullPath); + } 
+ } + return results; +} + +function isSharedRuntimeFile(file: string): boolean { + const rel = relative(ROOT, file).replaceAll('\\', '/'); + if (rel.includes('/server/') || rel.includes('/test/') || rel.includes('.test.')) { + return false; + } + + return rel.startsWith('shared/') || + rel.includes('/shared/'); +} + +describe('shared/browser Node import boundary', () => { + it('does not add new Node builtin imports to shared runtime modules', () => { + const offenders = walk(ROOT) + .filter(isSharedRuntimeFile) + .filter(file => NODE_IMPORT_PATTERN.test(readFileSync(file, 'utf8'))) + .map(file => relative(ROOT, file).replaceAll('\\', '/')) + .sort(); + + expect(offenders).toEqual([...KNOWN_SHARED_NODE_IMPORTS].sort()); + }); +}); diff --git a/src/tests/unit/startup-autonomous-work-gate.test.ts b/src/tests/unit/startup-autonomous-work-gate.test.ts new file mode 100644 index 000000000..2097092af --- /dev/null +++ b/src/tests/unit/startup-autonomous-work-gate.test.ts @@ -0,0 +1,48 @@ +import { afterEach, describe, expect, it } from 'vitest'; +import { mkdtempSync, rmSync, writeFileSync } from 'fs'; +import { join } from 'path'; +import { tmpdir } from 'os'; +import { StartupAutonomousWorkGate } from '../../system/user/server/modules/StartupAutonomousWorkGate'; + +const originalPauseFile = process.env.CONTINUUM_STARTUP_AUTONOMOUS_PAUSE_FILE; +const originalEnvPause = process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED; + +afterEach(() => { + if (originalPauseFile === undefined) { + delete process.env.CONTINUUM_STARTUP_AUTONOMOUS_PAUSE_FILE; + } else { + process.env.CONTINUUM_STARTUP_AUTONOMOUS_PAUSE_FILE = originalPauseFile; + } + + if (originalEnvPause === undefined) { + delete process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED; + } else { + process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED = originalEnvPause; + } +}); + +describe('StartupAutonomousWorkGate', () => { + it('removes stale owner-pid pause files instead of blocking forever', () => { + const dir = 
mkdtempSync(join(tmpdir(), 'continuum-startup-gate-')); + const pauseFile = join(dir, 'startup-autonomous-work.paused'); + process.env.CONTINUUM_STARTUP_AUTONOMOUS_PAUSE_FILE = pauseFile; + writeFileSync(pauseFile, '999999999'); + + expect(StartupAutonomousWorkGate.isPaused()).toBe(false); + + rmSync(dir, { recursive: true, force: true }); + }); + + it('fails open after max wait when an explicit env pause is left set', async () => { + const messages: string[] = []; + process.env.CONTINUUM_AUTONOMOUS_WORK_PAUSED = '1'; + + await StartupAutonomousWorkGate.waitUntilOpen( + message => messages.push(message), + 'unit test', + { maxWaitMs: 5, pollMs: 1 } + ); + + expect(messages.some(message => message.includes('failing open'))).toBe(true); + }); +}); diff --git a/src/workers/continuum-core/src/modules/channel.rs b/src/workers/continuum-core/src/modules/channel.rs index 0723268e0..9715b223a 100644 --- a/src/workers/continuum-core/src/modules/channel.rs +++ b/src/workers/continuum-core/src/modules/channel.rs @@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use std::any::Any; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use ts_rs::TS; use uuid::Uuid; @@ -78,6 +78,15 @@ pub struct ChannelState { pub self_task_generators: DashMap>, /// Tick configuration — adjustable at runtime via channel/tick-config command. pub tick_config: std::sync::RwLock, + /// Circuit breaker for DB-backed tick work. One failing Postgres path should + /// not fan out into N personas × M queries every tick. 
+ pub db_tick_backoff: std::sync::Mutex, +} + +#[derive(Debug, Default)] +pub struct DbTickBackoff { + pub consecutive_failures: u32, + pub backoff_until: Option, } impl ChannelState { @@ -87,6 +96,7 @@ impl ChannelState { personas, self_task_generators: DashMap::new(), tick_config: std::sync::RwLock::new(ChannelTickConfig::default()), + db_tick_backoff: std::sync::Mutex::new(DbTickBackoff::default()), } } @@ -100,6 +110,7 @@ impl ChannelState { personas, self_task_generators: DashMap::new(), tick_config: std::sync::RwLock::new(ChannelTickConfig::default()), + db_tick_backoff: std::sync::Mutex::new(DbTickBackoff::default()), } } } @@ -443,6 +454,12 @@ impl ServiceModule for ChannelModule { return Ok(()); } + if (config.task_poll_enabled || config.self_task_enabled || config.training_check_enabled) + && self.should_skip_db_tick() + { + return Ok(()); + } + let executor = crate::runtime::command_executor::executor(); let mut total_enqueued = 0u32; let mut total_self_tasks = 0u32; @@ -465,20 +482,29 @@ impl ServiceModule for ChannelModule { ) .await; - if let Ok(result_json) = query_result { - if let Some(records) = result_json.get("data").and_then(|d| d.as_array()) { - for record in records { - if let Some(item) = Self::record_to_task_queue_item(record, persona_id) - { - if let Some(mut entry) = self.state.registries.get_mut(persona_id) { - let (registry, _state) = entry.value_mut(); - if registry.route(Box::new(item)).is_ok() { - total_enqueued += 1; + match query_result { + Ok(result_json) => { + if let Some(records) = result_json.get("data").and_then(|d| d.as_array()) { + for record in records { + if let Some(item) = + Self::record_to_task_queue_item(record, persona_id) + { + if let Some(mut entry) = + self.state.registries.get_mut(persona_id) + { + let (registry, _state) = entry.value_mut(); + if registry.route(Box::new(item)).is_ok() { + total_enqueued += 1; + } } } } } } + Err(e) => { + self.record_db_tick_failure(&format!("task poll failed: {e}")); + return 
Ok(()); + } } } @@ -514,7 +540,10 @@ impl ServiceModule for ChannelModule { } } Err(e) => { - log.warn(&format!("Self-task gen failed for {}: {}", persona_id, e)) + self.record_db_tick_failure(&format!( + "self-task gen failed for {persona_id}: {e}" + )); + return Ok(()); } } } @@ -569,24 +598,32 @@ impl ServiceModule for ChannelModule { ) .await; - if let Ok(count_json) = training_result { - let count = count_json.get("data").and_then(|v| v.as_u64()).unwrap_or(0); - - if count >= config.training_threshold { - log.info(&format!("Training threshold met for {} ({} examples), triggering genome/job-create", persona_id, count)); - let _ = crate::runtime::command_executor::execute_ts_json( - "genome/job-create", - serde_json::json!({ - "personaId": persona_id.to_string(), - "trainingExamples": count, - }), - ) - .await; + match training_result { + Ok(count_json) => { + let count = count_json.get("data").and_then(|v| v.as_u64()).unwrap_or(0); + + if count >= config.training_threshold { + log.info(&format!("Training threshold met for {} ({} examples), triggering genome/job-create", persona_id, count)); + let _ = crate::runtime::command_executor::execute_ts_json( + "genome/job-create", + serde_json::json!({ + "personaId": persona_id.to_string(), + "trainingExamples": count, + }), + ) + .await; + } + } + Err(e) => { + self.record_db_tick_failure(&format!("training check failed: {e}")); + return Ok(()); } } } } + self.record_db_tick_success(); + if total_enqueued > 0 || total_self_tasks > 0 { log.info(&format!( "Tick: {} personas, polled {} tasks, generated {} self-tasks", @@ -605,6 +642,44 @@ impl ServiceModule for ChannelModule { } impl ChannelModule { + fn should_skip_db_tick(&self) -> bool { + let Ok(backoff) = self.state.db_tick_backoff.lock() else { + return false; + }; + + backoff + .backoff_until + .map(|until| Instant::now() < until) + .unwrap_or(false) + } + + fn record_db_tick_success(&self) { + if let Ok(mut backoff) = self.state.db_tick_backoff.lock() { + 
backoff.consecutive_failures = 0; + backoff.backoff_until = None; + } + } + + fn record_db_tick_failure(&self, reason: &str) { + let log = crate::runtime::logger("channel-tick"); + if let Ok(mut backoff) = self.state.db_tick_backoff.lock() { + backoff.consecutive_failures = backoff.consecutive_failures.saturating_add(1); + let delay_secs = match backoff.consecutive_failures { + 1 => 60, + 2 => 120, + 3 => 300, + _ => 600, + }; + backoff.backoff_until = Some(Instant::now() + Duration::from_secs(delay_secs)); + log.warn(&format!( + "DB-backed tick disabled for {delay_secs}s after {} consecutive failure(s): {reason}", + backoff.consecutive_failures + )); + } else { + log.warn(&format!("DB-backed tick failed: {reason}")); + } + } + /// Convert a DB record (from data/query result) to a TaskQueueItem. fn record_to_task_queue_item(record: &Value, persona_id: &Uuid) -> Option { let record_id = record diff --git a/src/workers/continuum-core/src/orm/sqlite.rs b/src/workers/continuum-core/src/orm/sqlite.rs index a823f0504..532221e4a 100644 --- a/src/workers/continuum-core/src/orm/sqlite.rs +++ b/src/workers/continuum-core/src/orm/sqlite.rs @@ -252,6 +252,18 @@ fn evolve_table_schema(conn: &Connection, table: &str, data: &Value) -> bool { added > 0 } +fn projection_dummy(select: &Option>) -> Option { + let cols = select.as_ref()?; + if cols.is_empty() { + return None; + } + let mut dummy = serde_json::Map::new(); + for col in cols { + dummy.insert(col.clone(), Value::Null); + } + Some(Value::Object(dummy)) +} + fn do_create(conn: &Connection, record: DataRecord) -> StorageResult { let table = naming::to_table_name(&record.collection); let now = chrono::Utc::now().to_rfc3339(); @@ -956,6 +968,25 @@ impl StorageAdapter for SqliteAdapter { } async fn query(&self, query: StorageQuery) -> StorageResult> { + if let Some(dummy) = projection_dummy(&query.select) { + let writer = match self.get_writer() { + Ok(c) => c, + Err(e) => return StorageResult::err(e), + }; + let table = 
naming::to_table_name(&query.collection); + let ensure_result = tokio::task::spawn_blocking(move || { + let conn = writer.lock().unwrap(); + ensure_table_exists(&conn, &table, &dummy)?; + evolve_table_schema(&conn, &table, &dummy); + Ok::<(), String>(()) + }) + .await + .unwrap_or_else(|e| Err(format!("spawn_blocking failed: {}", e))); + if let Err(e) = ensure_result { + return StorageResult::err(e); + } + } + let conn = match self.get_reader() { Ok(c) => c, Err(e) => return StorageResult::err(e), @@ -1331,4 +1362,43 @@ mod tests { assert!(query_result.success); assert_eq!(query_result.data.unwrap().len(), 10); } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn test_query_projection_evolves_missing_columns_before_select() { + let (adapter, _dir) = setup_adapter().await; + + adapter + .ensure_schema(CollectionSchema { + collection: "recipes".to_string(), + fields: vec![super::super::types::SchemaField { + name: "displayName".to_string(), + field_type: super::super::types::FieldType::String, + indexed: false, + unique: false, + nullable: false, + max_length: None, + }], + indexes: vec![], + }) + .await; + + let result = adapter + .query(StorageQuery { + collection: "recipes".to_string(), + select: Some(vec![ + "displayName".to_string(), + "team".to_string(), + "modes".to_string(), + ]), + limit: Some(10), + ..Default::default() + }) + .await; + + assert!( + result.success, + "projection query should evolve missing selected columns: {:?}", + result.error + ); + } } diff --git a/src/workers/continuum-core/src/persona/self_task_generator.rs b/src/workers/continuum-core/src/persona/self_task_generator.rs index 96f93d73a..52df07122 100644 --- a/src/workers/continuum-core/src/persona/self_task_generator.rs +++ b/src/workers/continuum-core/src/persona/self_task_generator.rs @@ -115,7 +115,7 @@ impl SelfTaskGenerator { } } } - Err(e) => log.warn(&format!("Unfinished work detection failed: {e}")), + Err(e) => return Err(format!("unfinished work 
detection failed: {e}")), } // 4. Learning opportunities (failed tasks) @@ -130,7 +130,7 @@ impl SelfTaskGenerator { } } } - Err(e) => log.warn(&format!("Learning opportunity detection failed: {e}")), + Err(e) => return Err(format!("learning opportunity detection failed: {e}")), } Ok(created_tasks) diff --git a/src/workers/start-workers.sh b/src/workers/start-workers.sh index 498e189a6..5d9389ac4 100755 --- a/src/workers/start-workers.sh +++ b/src/workers/start-workers.sh @@ -9,6 +9,7 @@ RED='\033[0;31m' NC='\033[0m' # No Color CONFIG_FILE="$(dirname "$0")/workers-config.json" +PROJECT_DIR="$(cd "$(dirname "$0")/.." && pwd)" # All data lives at $HOME/.continuum — matches SystemPaths.root in TypeScript. CONTINUUM_ROOT="${CONTINUUM_ROOT:-$HOME/.continuum}" @@ -39,6 +40,29 @@ parse_memory_limit() { esac } +default_core_memory_limit() { + local phys_mib="" + if [ "$(uname -s)" = "Darwin" ] && command -v sysctl >/dev/null 2>&1; then + phys_mib=$(sysctl -n hw.memsize 2>/dev/null | awk '{print int($1/1024/1024)}') + elif [ -f /proc/meminfo ]; then + phys_mib=$(awk '/^MemTotal:/{print int($2/1024)}' /proc/meminfo) + fi + + if [ -z "$phys_mib" ] || [ "$phys_mib" -le 0 ]; then + echo "16G" + return + fi + + local phys_gb=$((phys_mib / 1024)) + if [ "$phys_gb" -ge 32 ]; then + echo "$((phys_gb - 10))G" + elif [ "$phys_gb" -ge 20 ]; then + echo "$((phys_gb - 8))G" + else + echo "10G" + fi +} + # Source config.env to get API keys (HF_TOKEN, etc.) for workers if [ -f "$HOME/.continuum/config.env" ]; then set -a # Auto-export all variables @@ -142,9 +166,16 @@ YAML fi fi - LIVEKIT_LOG_LEVEL=info "$LIVEKIT_BIN" $LIVEKIT_EXTRA_ARGS >> "$LIVEKIT_LOG" 2>&1 & - LIVEKIT_PID=$! 
- disown $LIVEKIT_PID + livekit_args=() + if [ -n "$LIVEKIT_EXTRA_ARGS" ]; then + # shellcheck disable=SC2206 + livekit_args=($LIVEKIT_EXTRA_ARGS) + fi + LIVEKIT_PID=$(node "$PROJECT_DIR/scripts/spawn-detached.mjs" \ + --cwd "$PROJECT_DIR" \ + --log "$LIVEKIT_LOG" \ + --env LIVEKIT_LOG_LEVEL=info \ + -- "$LIVEKIT_BIN" "${livekit_args[@]}") # Wait for LiveKit to be ready (port 7880) for i in {1..20}; do @@ -231,6 +262,9 @@ while read -r worker; do worker_type=$(echo "$worker" | jq -r '.type // "socket"') description=$(echo "$worker" | jq -r '.description') mem_limit=$(echo "$worker" | jq -r '.memoryLimit // empty') + if [ "$name" = "continuum-core" ] && [ -z "$mem_limit" ]; then + mem_limit="${CONTINUUM_CORE_MEM:-$(default_core_memory_limit)}" + fi # Get args array (may be empty) — resolve .continuum paths to absolute args=$(echo "$worker" | jq -r '.args[]?' | while read -r arg; do resolve_path "$arg"; done || echo "") @@ -244,16 +278,18 @@ while read -r worker; do # ulimit -v: only enforce on macOS. Linux enforces strictly and CUDA/WebRTC # need far more virtual memory than the configured limit allows. - ULIMIT_CMD="" + spawn_memory_args=() if [ "$(uname -s)" = "Darwin" ]; then - ULIMIT_CMD="ulimit -v $MEM_LIMIT_KB 2>/dev/null || true;" + spawn_memory_args=(--ulimit-v-kb "$MEM_LIMIT_KB") fi if [ "$worker_type" = "tcp" ]; then # TCP worker (e.g., gRPC server) - no socket argument - (eval "$ULIMIT_CMD" exec "$binary") >> "$CONTINUUM_ROOT/jtag/logs/system/${name}.log" 2>&1 & - WORKER_PID=$! 
- disown $WORKER_PID + WORKER_PID=$(node "$PROJECT_DIR/scripts/spawn-detached.mjs" \ + --cwd "$PROJECT_DIR" \ + --log "$CONTINUUM_ROOT/jtag/logs/system/${name}.log" \ + "${spawn_memory_args[@]}" \ + -- "$binary") # Wait for TCP port to be listening for i in {1..40}; do @@ -270,19 +306,18 @@ while read -r worker; do done else # Unix socket worker - each gets its own log file for better segregation - if [ -z "$args" ]; then - (eval "$ULIMIT_CMD" exec "$binary" "$socket") >> "$CONTINUUM_ROOT/jtag/logs/system/${name}.log" 2>&1 & - else - # Convert newline-separated args to array - arg_array=() + arg_array=() + if [ -n "$args" ]; then while IFS= read -r arg; do arg_array+=("$arg") done <<< "$args" - (eval "$ULIMIT_CMD" exec "$binary" "$socket" "${arg_array[@]}") >> "$CONTINUUM_ROOT/jtag/logs/system/${name}.log" 2>&1 & fi - WORKER_PID=$! - disown $WORKER_PID # Fully detach from shell + WORKER_PID=$(node "$PROJECT_DIR/scripts/spawn-detached.mjs" \ + --cwd "$PROJECT_DIR" \ + --log "$CONTINUUM_ROOT/jtag/logs/system/${name}.log" \ + "${spawn_memory_args[@]}" \ + -- "$binary" "$socket" "${arg_array[@]}") # Wait for socket to be created (30s timeout) for i in {1..60}; do From bc5e69b10701d6ed664180ece0f09df14537771d Mon Sep 17 00:00:00 2001 From: Test Date: Thu, 7 May 2026 19:10:09 -0500 Subject: [PATCH 2/2] Architect local Qwen persona runtime --- docs/planning/ALPHA-GAP-ANALYSIS.md | 63 ++- .../server/AIProvidersStatusServerCommand.ts | 4 +- .../chat/poll/server/ChatPollServerCommand.ts | 75 ++-- .../chat/poll/shared/ChatPollTypes.ts | 15 +- src/scripts/minimal-server-template.ts | 16 +- src/scripts/seed-continuum.ts | 29 +- src/scripts/seed/personas.ts | 128 +++--- src/shared/ModelRegistry.ts | 3 +- src/shared/workers/PersonaWorkerThread.ts | 10 +- src/shared/workers/persona-worker.ts | 68 +-- src/system/adapters/IAdapterProvider.ts | 8 +- src/system/adapters/LocalAdapterProvider.ts | 24 +- src/system/ai/server/AIDecisionService.ts | 14 +- .../server/InferenceCoordinator.ts 
| 5 +- .../orchestration/SystemOrchestrator.ts | 85 ++-- .../rag/sources/CodebaseSearchSource.ts | 53 ++- .../rag/sources/ConversationHistorySource.ts | 67 +-- .../rag/sources/conversationHistoryPoison.ts | 58 +++ .../test/unit/CodebaseSearchSource.test.ts | 51 +++ .../unit/ConversationHistorySource.test.ts | 27 ++ src/system/secrets/SecretManager.ts | 52 ++- src/system/shared/Constants.ts | 77 +--- src/system/shared/ModelCapabilities.ts | 8 +- src/system/shared/ModelRegistry.ts | 8 +- .../user/server/PersonaLifecycleManager.ts | 2 +- src/system/user/server/PersonaUser.ts | 55 ++- .../user/server/modules/PersonaGenome.ts | 3 +- .../server/modules/PersonaTaskExecutor.ts | 2 +- .../user/server/modules/ProgressiveScorer.ts | 5 +- .../modules/cognition/PeerReviewTypes.ts | 6 +- .../modules/cognition/adapters/LLMAdapter.ts | 8 +- .../integration/PersonaUser-Lifecycle.test.ts | 4 +- src/workers/continuum-core/config/models.toml | 6 - .../continuum-core/config/providers.toml | 2 +- src/workers/continuum-core/src/ai/adapter.rs | 117 ++++- .../src/inference/candle_adapter.rs | 250 +---------- .../src/inference/llamacpp_adapter.rs | 11 +- .../continuum-core/src/inference/model.rs | 9 +- .../continuum-core/src/inference/quantized.rs | 4 +- .../src/model_registry/artifacts.rs | 412 ++++++++++++++++++ .../src/model_registry/loader.rs | 169 ++++--- .../continuum-core/src/model_registry/mod.rs | 5 + .../src/model_registry/types.rs | 11 +- .../continuum-core/src/modules/ai_provider.rs | 3 +- .../continuum-core/src/persona/allocator.rs | 158 ++++--- .../continuum-core/src/persona/catalog.json | 39 +- .../continuum-core/src/persona/evaluator.rs | 74 +++- src/workers/continuum-core/src/secrets.rs | 27 +- 48 files changed, 1437 insertions(+), 893 deletions(-) create mode 100644 src/system/rag/sources/conversationHistoryPoison.ts create mode 100644 src/system/rag/test/unit/CodebaseSearchSource.test.ts create mode 100644 src/system/rag/test/unit/ConversationHistorySource.test.ts create 
mode 100644 src/workers/continuum-core/src/model_registry/artifacts.rs diff --git a/docs/planning/ALPHA-GAP-ANALYSIS.md b/docs/planning/ALPHA-GAP-ANALYSIS.md index 789b73b51..f654d6502 100644 --- a/docs/planning/ALPHA-GAP-ANALYSIS.md +++ b/docs/planning/ALPHA-GAP-ANALYSIS.md @@ -34,6 +34,7 @@ The non-negotiable gates: | Docker | Too much historical bulk and mixed responsibility; several open Docker issues remain | Docker can mask failures and slow iteration | | Rust core | Strong core exists, but GPU lifecycle, paging, and persona runtime boundaries are still incomplete | Core instability can make UI/Node fixes irrelevant | | Node/TS | Still owns too much cognition/command behavior | Adds latency, GC/IPC complexity, and harder cross-platform reuse | +| Config/secrets | `$HOME/.continuum/config.env` is the local source of truth, but empty placeholders and per-process loading have caused false provider availability | Cloud providers can steal local turns and fail; grid nodes cannot yet receive encrypted config consistently | | Tests | Many tests exist, but the alpha loop still overuses `npm start`/browser/Docker as proof | Slow tests hide root causes and discourage TDD | ## Issue-Driven Workstreams @@ -75,6 +76,30 @@ Implementation posture: - If build is unavoidable, make it explicit and resumable. - Install health must distinguish: network unavailable, Docker unavailable, GPU unavailable, model unavailable, Rust core unavailable, UI unavailable. +### 1A. Config, Secrets, And Grid Propagation + +**Goal**: one authoritative config path per node, explicit encrypted propagation across trusted grid nodes, and no false "configured" state from empty placeholders. 
+ +| Issue | Priority | Direction | Test gate | +|---|---:|---|---| +| file: config single-source issue | P0 | `SecretManager` and Rust `secrets.rs` must treat only non-empty values as configured and must lazy-load `$HOME/.continuum/config.env` before any provider check | provider status shows cloud unavailable for empty placeholders; local chat still works | +| file: `grid/config/sync` command issue | P0 | create a command pair for encrypted config sharing over trusted grid/Tailscale nodes; no loose file copying and no browser exposure | two-node test shares selected keys, decrypts only on trusted target, and never logs values | +| #860 config.env as directory | P1 | keep setup file/dir creation idempotent and typed | setup test catches file-vs-dir mismatch | + +Command shape: + +- `grid/config/status`: list configured key names, source path, empty placeholders, and target-node drift without values. +- `grid/config/export`: encrypt selected config keys for a specific trusted node identity. +- `grid/config/import`: decrypt and merge selected keys into the target node's `$HOME/.continuum/config.env`. +- `grid/config/sync`: orchestrate export/import across trusted grid nodes and report per-node success. + +Rules: + +- Empty placeholders such as `DEEPSEEK_API_KEY=` are documentation, not availability. +- Local mode must work with zero API keys. +- Cloud personas are eligible only when their required key is non-empty and the provider health check is not expired/failed. +- Config sharing is an owner/trusted-node command. It should use grid identity plus transport encryption, then persist through `SecretManager` so all runtimes see one source. + ### 2. GPU Runtime Stability **Goal**: GPU resource failures degrade or recover; they do not brick the session. 
@@ -141,6 +166,31 @@ Near-term PR sequence: | #944 embedding loop/cache misses | P1 | migrate embedding cache to shared paging primitive | repeated index pass has cache hits and bounded memory | | #911 16GB MacBook Air | P1 | define reduced alpha profile with strict budgets | 16GB profile starts and reports disabled features honestly | +Model selection contract: + +- Callers request capabilities, not model IDs. +- Discovery and admission are separate: discovery builds the catalog of model + artifacts, modalities, context windows, templates, quantizations, and backend + requirements; admission chooses the best viable candidate for the current + machine state and request. +- The catalog is a curated whitelist, not arbitrary Hugging Face passthrough. + Candidate discovery may crawl/search HF offline or through foundry commands, + but runtime selection only admits vetted rows with known templates, license, + backend compatibility, memory estimates, modality metadata, and forge status. +- Foundry output flows back into the same registry: `candidate` -> `vetted` -> + `forged` -> `published`, with Sentinel/foundry jobs updating metadata rather + than TS code hardcoding new model names. +- Provider identity must be typed. Runtime local chat is `LocalRuntime` + (llama.cpp/Qwen through our adapter stack), cloud providers are explicit + external identities, and Candle is not an inference provider for persona chat. + Export this with `ts-rs` so TS seed/config/user paths cannot invent free-form + provider strings. +- Request fields should be typed: `taskKind`, `minIntelligence`, `modalities`, `toolSupport`, `minContextTokens`, `latencyClass`, `qualityClass`, `memoryBudget`, `gpuRequired`, `familyAllowlist`, `familyPreference`, and `explicitOverride`. 
+- Constraint syntax should feel like semver where it helps: exact pins for repro, `>=` for minimum intelligence/capability, `~qwen3.5` for near-family preference, ranges for context/latency/memory, and hard allow/deny lists for safety. +- Rust registry/admission returns the selected provider/model/artifact plus explanation: why selected, why alternatives were rejected, projected VRAM/RAM/KV/LoRA footprint, and whether the choice is degraded. +- Persona seed stores intent (`local-default`, `vision-default`, future typed capability refs), not hardcoded model strings. +- TS may display selection state; it must not invent fallback models. + Implementation order: 1. PressureBroker admission gate. @@ -219,12 +269,13 @@ Design rule: |---:|---|---|---|---|---| | 1 | `codex/alpha-gap-stability-plan` | `canary` | planning doc | this document; shared execution map | docs lint/readability, AIRC review | | 2 | `fix/gpu-backend-lifecycle` | `canary` | #1048, #1050, #960, #964 | mutex + backend state/recovery | Rust tests with injected failure; GPU provider evidence | -| 3 | `fix/docker-alpha-profiles` | `canary` | #892, #955, #834, #776, #796 | modular Docker profile cleanup | compose profile smoke; image size report | -| 4 | `feature/persona-rust-replay` | `canary` | #969, #909 | Rust persona replay/tool-loop foundation | `cargo test`; net-negative TS cognition lines | -| 5 | `feature/pressure-broker-gate` | `canary` | #1049, #1051, #945, #944 | admission gate + first resource consumer | memory/load tests; no Node required | -| 6 | `fix/realtime-core-reconnect` | `canary` | #793, #794, #773 | core restart + realtime browser recovery | kill core, command recovers, browser receives AI message | -| 7 | `feature/airc-persona-peer` | `canary` | #967, PR #1046 | Continuum persona as AIRC participant | AIRC -> Continuum -> AIRC round trip | -| 8 | `test/fresh-install-e2e` | `canary` | #770, #1006-#1008, #983 | install validation matrix | Mac + Windows logs; no silent waits | +| 3 | 
`feature/grid-config-sync` | `canary` | config single-source, grid config sync | encrypted config status/export/import/sync commands | two-node encrypted config sync; provider status remains truthful | +| 4 | `fix/docker-alpha-profiles` | `canary` | #892, #955, #834, #776, #796 | modular Docker profile cleanup | compose profile smoke; image size report | +| 5 | `feature/persona-rust-replay` | `canary` | #969, #909 | Rust persona replay/tool-loop foundation | `cargo test`; net-negative TS cognition lines | +| 6 | `feature/pressure-broker-gate` | `canary` | #1049, #1051, #945, #944 | admission gate + first resource consumer | memory/load tests; no Node required | +| 7 | `fix/realtime-core-reconnect` | `canary` | #793, #794, #773 | core restart + realtime browser recovery | kill core, command recovers, browser receives AI message | +| 8 | `feature/airc-persona-peer` | `canary` | #967, PR #1046 | Continuum persona as AIRC participant | AIRC -> Continuum -> AIRC round trip | +| 9 | `test/fresh-install-e2e` | `canary` | #770, #1006-#1008, #983 | install validation matrix | Mac + Windows logs; no silent waits | This order can change when a blocker is discovered, but changes must be made in this document and on the issue/PR thread, not only in chat. diff --git a/src/commands/ai/providers/status/server/AIProvidersStatusServerCommand.ts b/src/commands/ai/providers/status/server/AIProvidersStatusServerCommand.ts index 2d03da4f6..116fcdef3 100644 --- a/src/commands/ai/providers/status/server/AIProvidersStatusServerCommand.ts +++ b/src/commands/ai/providers/status/server/AIProvidersStatusServerCommand.ts @@ -146,8 +146,8 @@ export class AIProvidersStatusServerCommand extends AIProvidersStatusCommand { // positive isConfigured=true for every fresh install, leading users to // attempt chat and hit an opaque 401. Check the actual value length // instead. (#980 Bug 5.) - const rawKey = config.category === 'local' ? 
undefined : secrets.get(config.key); - const isConfigured = config.category === 'local' ? true : (rawKey?.length ?? 0) > 0; + const rawKey = config.category === 'local' ? undefined : secrets.get(config.key, 'AIProvidersStatusServerCommand'); + const isConfigured = config.category === 'local' ? true : (rawKey?.trim().length ?? 0) > 0; return { provider: config.provider, diff --git a/src/commands/collaboration/chat/poll/server/ChatPollServerCommand.ts b/src/commands/collaboration/chat/poll/server/ChatPollServerCommand.ts index a5378842c..0cb8319ec 100644 --- a/src/commands/collaboration/chat/poll/server/ChatPollServerCommand.ts +++ b/src/commands/collaboration/chat/poll/server/ChatPollServerCommand.ts @@ -1,5 +1,5 @@ /** - * Chat Poll Server Command - Get messages after a specific messageId + * Chat Poll Server Command - Get recent messages or messages after a marker */ import type { JTAGContext } from '@system/core/types/JTAGTypes'; @@ -29,48 +29,52 @@ export class ChatPollServerCommand extends ChatPollCommand { } } - // Get the original message to find its timestamp - const originalMessageResult = await ORM.query({ - collection: 'chat_messages', - filter: { id: params.afterMessageId }, - limit: 1 - }, 'default'); + const filter: {timestamp?: {$gt: string}, roomId?: UUID} = {}; - if (!originalMessageResult.success || !originalMessageResult.data || originalMessageResult.data.length === 0) { - return { - context: params.context, - sessionId: params.sessionId, - success: false, - messages: [], - count: 0, - afterMessageId: params.afterMessageId, - timestamp: new Date().toISOString(), - error: `Message not found: ${params.afterMessageId}` - }; - } + if (params.afterMessageId) { + // Get the original message to find its timestamp. 
+ const originalMessageResult = await ORM.query({ + collection: 'chat_messages', + filter: { id: params.afterMessageId }, + limit: 1 + }, 'default'); + + if (!originalMessageResult.success || !originalMessageResult.data || originalMessageResult.data.length === 0) { + return { + context: params.context, + sessionId: params.sessionId, + success: false, + messages: [], + count: 0, + afterMessageId: params.afterMessageId, + timestamp: new Date().toISOString(), + error: `Message not found: ${params.afterMessageId}` + }; + } - const originalMessage = originalMessageResult.data[0]; + const originalMessage = originalMessageResult.data[0]; - // Build filter for messages after this one - // Convert Date to ISO string for query comparison - const afterTimestamp = originalMessage.data.timestamp instanceof Date - ? originalMessage.data.timestamp.toISOString() - : originalMessage.data.timestamp; + // Build filter for messages after this one. + const afterTimestamp = originalMessage.data.timestamp instanceof Date + ? originalMessage.data.timestamp.toISOString() + : originalMessage.data.timestamp; - const filter: {timestamp: {$gt: string}, roomId?: UUID} = { - timestamp: { $gt: afterTimestamp } - }; + filter.timestamp = { $gt: afterTimestamp }; + } // Optional room filter (from roomId or resolved room name) if (roomId) { filter.roomId = roomId; } - // Query messages + const sortDirection = params.afterMessageId ? 'asc' : 'desc'; + + // Query messages. No afterMessageId means "latest messages"; this is + // the ergonomic smoke-test/default read path for CLI and agents. 
const result = await ORM.query({ collection: 'chat_messages', filter, - sort: [{ field: 'timestamp', direction: 'asc' }], + sort: [{ field: 'timestamp', direction: sortDirection }], limit: params.limit || 50 }, 'default'); @@ -87,8 +91,15 @@ export class ChatPollServerCommand extends ChatPollCommand { }; } - // Extract entity data from DataRecord[] - const messages = result.data.map(record => record.data); + // Extract entity data from DataRecord[] and normalize + // latest-mode back to chronological order for display/readability. + const messages = result.data + .map(record => record.data) + .sort((a, b) => { + const aTime = new Date(a.timestamp).getTime(); + const bTime = new Date(b.timestamp).getTime(); + return aTime - bTime; + }); return { context: params.context, diff --git a/src/commands/collaboration/chat/poll/shared/ChatPollTypes.ts b/src/commands/collaboration/chat/poll/shared/ChatPollTypes.ts index 85461074b..11a132701 100644 --- a/src/commands/collaboration/chat/poll/shared/ChatPollTypes.ts +++ b/src/commands/collaboration/chat/poll/shared/ChatPollTypes.ts @@ -1,10 +1,11 @@ /** - * Chat Poll Command Types - Get messages after a specific messageId + * Chat Poll Command Types - Get recent messages or messages after a marker * * Simple command for conversational research workflow: * 1. Send a question and get messageId - * 2. Wait for responses (sleep) - * 3. Poll for all messages after your question + * 2. Wait for responses + * 3. Poll for all messages after your question, or omit afterMessageId to + * inspect the latest messages in a room. */ import type { JTAGContext, CommandParams, JTAGPayload, CommandInput} from '@system/core/types/JTAGTypes'; @@ -21,8 +22,9 @@ export interface ChatPollParams extends CommandParams { readonly context: JTAGContext; readonly sessionId: UUID; - // Message ID to poll from (returns all messages after this one) - readonly afterMessageId: UUID; + // Optional message ID to poll from (returns messages after this one). 
+ // When omitted, returns latest messages in the room. + readonly afterMessageId?: UUID; // Optional: limit number of messages returned readonly limit?: number; @@ -41,7 +43,7 @@ export interface ChatPollResult extends JTAGPayload { readonly success: boolean; readonly messages: ReadonlyArray; readonly count: number; - readonly afterMessageId: UUID; + readonly afterMessageId?: UUID; readonly timestamp: string; readonly error?: string; } @@ -92,4 +94,3 @@ export const createCollaborationChatPollResultFromParams = ( params: ChatPollParams, differences: Omit ): ChatPollResult => transformPayload(params, differences); - diff --git a/src/scripts/minimal-server-template.ts b/src/scripts/minimal-server-template.ts index 9c6d7dae8..f3e02b832 100644 --- a/src/scripts/minimal-server-template.ts +++ b/src/scripts/minimal-server-template.ts @@ -18,6 +18,12 @@ const PORT = connectionConfig.httpPort; import { getNetworkIdentity, getTlsOptions } from '../system/config/server/NetworkIdentity'; +function isBenignConnectionError(error: unknown): boolean { + if (!error || typeof error !== 'object') return false; + const code = (error as NodeJS.ErrnoException).code; + return code === 'EPIPE' || code === 'ECONNRESET' || code === 'ERR_STREAM_DESTROYED'; +} + class MinimalServer { private server: http.Server | https.Server; private requestInProgress = false; @@ -1259,11 +1265,19 @@ server.start().catch((error) => { // Global error handlers process.on('uncaughtException', (error) => { + if (isBenignConnectionError(error)) { + console.warn(`⚠️ Ignoring client disconnect: ${(error as Error).message}`); + return; + } console.error('🚨 Uncaught Exception:', error.message); process.exit(1); }); process.on('unhandledRejection', (reason) => { + if (isBenignConnectionError(reason)) { + console.warn(`⚠️ Ignoring client disconnect: ${reason instanceof Error ? 
reason.message : String(reason)}`); + return; + } console.error('🚨 Unhandled Rejection:', reason); process.exit(1); -}); \ No newline at end of file +}); diff --git a/src/scripts/seed-continuum.ts b/src/scripts/seed-continuum.ts index 0b803226e..f8054420b 100644 --- a/src/scripts/seed-continuum.ts +++ b/src/scripts/seed-continuum.ts @@ -23,7 +23,7 @@ import { TrainingSessionEntity } from '../system/data/entities/TrainingSessionEn import { ActivityEntity } from '../system/data/entities/ActivityEntity'; import { ActivityDataSeed } from '../api/data-seed/ActivityDataSeed'; import { SystemIdentity } from '../api/data-seed/SystemIdentity'; -import { PERSONA_CONFIGS, PERSONA_UNIQUE_IDS, getAvailablePersonas, selectLocalModel, type PersonaConfig } from './seed/personas'; +import { OPTIONAL_CLOUD_PERSONA_CONFIGS, PERSONA_CONFIGS, PERSONA_UNIQUE_IDS, getAvailablePersonas, selectLocalModel, type PersonaConfig } from './seed/personas'; import { DATA_COMMANDS } from '../commands/data/shared/DataCommandConstants'; import { createRoom, @@ -420,12 +420,12 @@ async function seedViaJTAG() { } } - // Seed ALL personas — existence ≠ activation. - // The allocator decides which are ACTIVE at runtime based on hardware. - // But every persona must EXIST in the DB so they're ready when resources allow. - const activePersonas: PersonaConfig[] = Object.values(PERSONA_CONFIGS); + // Seed the active default fleet. Optional cloud personas are created only + // when their real API key exists; historical rows for missing-key providers + // are marked offline below so they cannot steal local chat turns. 
+ const activePersonas: PersonaConfig[] = getAvailablePersonas().personas; const localModel = selectLocalModel(0); // Default model, allocator overrides at runtime - console.log(`🎭 Seeding all ${activePersonas.length} personas (allocator activates at runtime)`); + console.log(`🎭 Seeding ${activePersonas.length} active persona(s)`); // BULK LOAD: One subprocess call replaces N individual lookups const { usersByUniqueId, missingUniqueIds } = await loadAllUsers(activePersonas); @@ -551,6 +551,23 @@ async function seedViaJTAG() { console.log('✅ Existing user configs updated'); } + const activePersonaIds = new Set(activePersonas.map(p => p.uniqueId)); + const optionalPersonaIds = new Set(OPTIONAL_CLOUD_PERSONA_CONFIGS.map(p => p.uniqueId)); + const staleOptionalUsers = [...usersByUniqueId.values()].filter(user => + user.uniqueId && + optionalPersonaIds.has(user.uniqueId) && + !activePersonaIds.has(user.uniqueId) && + user.status !== 'offline' + ); + if (staleOptionalUsers.length > 0) { + console.log(`🧊 Marking ${staleOptionalUsers.length} missing-key optional persona(s) offline`); + await Promise.all(staleOptionalUsers.map(user => { + const dataArg = JSON.stringify({ status: 'offline' }).replace(/'/g, `'"'"'`); + return execAsync(`./jtag ${DATA_COMMANDS.UPDATE} --collection=${UserEntity.collection} --id="${user.id}" --data='${dataArg}' --suppressEvents=true`) + .catch(() => undefined); + })); + } + // Get key user references const claudeUser = usersByUniqueId.get(PERSONA_UNIQUE_IDS.CLAUDE) ?? null; const helperPersona = usersByUniqueId.get(PERSONA_UNIQUE_IDS.HELPER) ?? null; diff --git a/src/scripts/seed/personas.ts b/src/scripts/seed/personas.ts index f0dcd047a..5b90e943f 100644 --- a/src/scripts/seed/personas.ts +++ b/src/scripts/seed/personas.ts @@ -1,15 +1,17 @@ /** * Persona Configuration - Single Source of Truth * - * All persona definitions in one place for easy maintenance. + * Active persona definitions in one place for easy maintenance. 
* Used by seed-continuum.ts to create persona users. * - * Hardware-aware: getAvailablePersonas() filters based on: - * - API keys present in environment (cloud providers) - * - GPU VRAM available (local candle inference) + * Alpha default: local-first. API keys unlock optional cloud capacity, but + * the default persona fleet must not depend on cloud providers or seed random + * model families into chat. Model choice is capability-driven: personas request + * symbolic refs and the Rust registry/admission layer selects the best artifact + * that fits hardware, VRAM/unified-memory pressure, LoRA paging, and task recipe. * * uniqueId format: Simple slug WITHOUT @ prefix - * Examples: claude, helper, grok, sentinel + * Examples: helper, teacher, codereview * * The @ symbol is ONLY for UI mentions, NOT part of uniqueId */ @@ -18,6 +20,7 @@ import { generateUniqueId } from '../../system/data/utils/UniqueIdUtils'; import { LOCAL_MODELS } from '../../system/shared/Constants'; import { SYMBOLIC_REFS } from '../../shared/ModelRegistry'; import { execSync } from 'child_process'; +import { SecretManager } from '../../system/secrets/SecretManager'; export interface PersonaConfig { uniqueId: string; @@ -36,7 +39,7 @@ export interface PersonaConfig { // drift entirely. 
isAudioNative?: boolean; // True if model supports direct audio I/O (no STT/TTS needed) apiKeyEnv?: string; // Environment variable name for the API key (e.g., 'ANTHROPIC_API_KEY') - minVramGB?: number; // Minimum VRAM in GB for local inference (candle provider) + minVramGB?: number; // Minimum memory budget in GB for local inference admission } /** @@ -51,35 +54,16 @@ export interface PersonaConfig { * Selected speakers for variety: some male, some female, different pitches/cadences */ export const PERSONA_CONFIGS: PersonaConfig[] = [ - // Core agents (cloud — need API key) - { uniqueId: generateUniqueId('Claude'), displayName: 'Claude Code', provider: 'anthropic', type: 'agent', voiceId: '10', apiKeyEnv: 'ANTHROPIC_API_KEY' }, - { uniqueId: generateUniqueId('General'), displayName: 'General AI', provider: 'anthropic', type: 'agent', voiceId: '25', apiKeyEnv: 'ANTHROPIC_API_KEY' }, - - // Local personas (Candle native Rust inference — need GPU VRAM) - // Model sizes: 14B coder ~9GB, 8B instruct ~5GB, 3B instruct ~3GB - // On big GPUs (5090 32GB), we run specialized models per persona - // On small GPUs (8GB), everyone shares the 3B model - // Local personas: NO provider hardcode. The Rust AdapterRegistry routes - // by honest model availability: DMR (Metal on Mac, CUDA on Linux/Nvidia) - // when the model is pulled, llama-vulkan for other GPU hardware, hard - // error if neither is available. Never silent Candle-CPU fallback. - // 4B GGUF is the universal default — fits every supported machine, fast - // on Metal/Vulkan/CUDA. Power users upgrade to 27B manually (HF-gated). + // Local personas. No cloud by default. + // Local personas request capability, not an engine. Rust admission resolves + // provider:local into the best available Qwen/llama.cpp runtime for this + // host, with a hard error when no supported local runtime exists. Never + // silently fall back to a CPU-only chat path. 
{ uniqueId: generateUniqueId('Helper'), displayName: 'Helper AI', provider: 'local', type: 'persona', voiceId: '50', minVramGB: 3, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, { uniqueId: generateUniqueId('Teacher'), displayName: 'Teacher AI', provider: 'local', type: 'persona', voiceId: '75', minVramGB: 5, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, { uniqueId: generateUniqueId('CodeReview'), displayName: 'CodeReview AI', provider: 'local', type: 'persona', voiceId: '100', minVramGB: 5, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, - - // Cloud provider personas (each needs its own API key) - { uniqueId: generateUniqueId('DeepSeek'), displayName: 'DeepSeek Assistant', provider: 'deepseek', type: 'persona', voiceId: '125', apiKeyEnv: 'DEEPSEEK_API_KEY' }, - { uniqueId: generateUniqueId('Groq'), displayName: 'Groq Lightning', provider: 'groq', type: 'persona', voiceId: '150', apiKeyEnv: 'GROQ_API_KEY' }, - { uniqueId: generateUniqueId('Claude Assistant'), displayName: 'Claude Assistant', provider: 'anthropic', type: 'persona', voiceId: '175', apiKeyEnv: 'ANTHROPIC_API_KEY' }, - { uniqueId: generateUniqueId('GPT'), displayName: 'GPT Assistant', provider: 'openai', type: 'persona', voiceId: '200', apiKeyEnv: 'OPENAI_API_KEY' }, - { uniqueId: generateUniqueId('Grok'), displayName: 'Grok', provider: 'xai', type: 'persona', voiceId: '220', apiKeyEnv: 'XAI_API_KEY' }, - { uniqueId: generateUniqueId('Together'), displayName: 'Together Assistant', provider: 'together', type: 'persona', voiceId: '30', apiKeyEnv: 'TOGETHER_API_KEY' }, - { uniqueId: generateUniqueId('Fireworks'), displayName: 'Fireworks AI', provider: 'fireworks', type: 'persona', voiceId: '60', apiKeyEnv: 'FIREWORKS_API_KEY' }, { uniqueId: generateUniqueId('Local'), displayName: 'Local Assistant', provider: 'local', type: 'persona', voiceId: '90', minVramGB: 4, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, { uniqueId: generateUniqueId('Sentinel'), displayName: 'Sentinel', provider: 'sentinel', type: 'persona', voiceId: '240' 
}, - { uniqueId: generateUniqueId('Gemini'), displayName: 'Gemini', provider: 'google', type: 'persona', voiceId: '115', apiKeyEnv: 'GOOGLE_API_KEY' }, // Native vision persona — local, free, no API key. Bound to // qwen2-vl-7b-instruct via the in-process llamacpp adapter (registered @@ -119,25 +103,21 @@ export const PERSONA_CONFIGS: PersonaConfig[] = [ // when the architecture supports concurrent mtmd backends safely. // See LIVE-VIDEO-CHAT-ARCHITECTURE.md for the design that lands this. - // Audio-native personas (need specific API keys) - { - uniqueId: generateUniqueId('Qwen3-Omni'), - displayName: 'Qwen3-Omni', - provider: 'alibaba', - type: 'persona', - modelId: 'qwen3-omni-flash-realtime', - isAudioNative: true, - apiKeyEnv: 'DASHSCOPE_API_KEY', - }, - { - uniqueId: generateUniqueId('Gemini-Live'), - displayName: 'Gemini Live', - provider: 'google', - type: 'persona', - modelId: 'gemini-2.5-flash-native-audio-preview', - isAudioNative: true, - apiKeyEnv: 'GOOGLE_API_KEY', - }, +]; + +export const OPTIONAL_CLOUD_PERSONA_CONFIGS: PersonaConfig[] = [ + { uniqueId: generateUniqueId('Claude'), displayName: 'Claude Code', provider: 'anthropic', type: 'agent', voiceId: '10', apiKeyEnv: 'ANTHROPIC_API_KEY' }, + { uniqueId: generateUniqueId('General'), displayName: 'General AI', provider: 'anthropic', type: 'agent', voiceId: '25', apiKeyEnv: 'ANTHROPIC_API_KEY' }, + { uniqueId: generateUniqueId('DeepSeek'), displayName: 'DeepSeek Assistant', provider: 'deepseek', type: 'persona', voiceId: '125', apiKeyEnv: 'DEEPSEEK_API_KEY' }, + { uniqueId: generateUniqueId('Groq'), displayName: 'Groq Lightning', provider: 'groq', type: 'persona', voiceId: '150', apiKeyEnv: 'GROQ_API_KEY' }, + { uniqueId: generateUniqueId('Claude Assistant'), displayName: 'Claude Assistant', provider: 'anthropic', type: 'persona', voiceId: '175', apiKeyEnv: 'ANTHROPIC_API_KEY' }, + { uniqueId: generateUniqueId('GPT'), displayName: 'GPT Assistant', provider: 'openai', type: 'persona', voiceId: '200', 
apiKeyEnv: 'OPENAI_API_KEY' }, + { uniqueId: generateUniqueId('Grok'), displayName: 'Grok', provider: 'xai', type: 'persona', voiceId: '220', apiKeyEnv: 'XAI_API_KEY' }, + { uniqueId: generateUniqueId('Together'), displayName: 'Together Assistant', provider: 'together', type: 'persona', voiceId: '30', apiKeyEnv: 'TOGETHER_API_KEY' }, + { uniqueId: generateUniqueId('Fireworks'), displayName: 'Fireworks AI', provider: 'fireworks', type: 'persona', voiceId: '60', apiKeyEnv: 'FIREWORKS_API_KEY' }, + { uniqueId: generateUniqueId('Gemini'), displayName: 'Gemini', provider: 'google', type: 'persona', voiceId: '115', apiKeyEnv: 'GOOGLE_API_KEY' }, + { uniqueId: generateUniqueId('Qwen3-Omni'), displayName: 'Qwen3-Omni', provider: 'alibaba', type: 'persona', modelId: 'qwen3-omni-flash-realtime', isAudioNative: true, apiKeyEnv: 'DASHSCOPE_API_KEY' }, + { uniqueId: generateUniqueId('Gemini-Live'), displayName: 'Gemini Live', provider: 'google', type: 'persona', modelId: 'gemini-2.5-flash-native-audio-preview', isAudioNative: true, apiKeyEnv: 'GOOGLE_API_KEY' }, ]; /** @@ -205,7 +185,7 @@ function detectGpu(): GpuInfo { return { vramGB: 0, device: 'CPU', type: 'cpu' }; } -/** Get total system RAM in GB — used for CPU inference budget when no GPU */ +/** Get total system RAM in GB — used for local-runtime admission hints when no GPU is visible */ function getSystemRamGB(): number { const run = (cmd: string): string | null => { try { return execSync(cmd, { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim(); } @@ -224,25 +204,26 @@ function getSystemRamGB(): number { } /** - * Filter PERSONA_CONFIGS to only personas that can actually run on this hardware. + * Filter persona configs to only personas that can actually run on this node. 
* * Rules: - * - Cloud personas: created only if their API key is set in environment - * - Local (candle) personas: created only if GPU has enough VRAM + * - Cloud personas: created only if their API key is present and non-empty + * - Local personas: created only if this node has enough VRAM/unified/RAM budget * - Sentinel: created only if SENTINEL_PATH is set - * - No API key + no GPU = at minimum create Helper AI with candle fallback (CPU mode) + * - No API key + no GPU = at minimum seed Helper AI so the user can see which local runtime/config is missing * * Returns the filtered list and a summary of what was included/excluded. */ /** - * Select the best local model for this hardware's VRAM budget. - * Returns HuggingFace model ID suitable for Candle inference. + * Select the symbolic local model family for this hardware's memory budget. + * + * This is a seed-time hint only. Concrete artifact selection belongs in the + * Rust model registry/admission layer because that code owns GPU pressure, + * context/KV cost, LoRA paging, and backend availability.
* * Budget logic (per persona, after system reserve): - * 32GB+ CUDA → 14B coder (BF16 if available, else GGUF Q5) - * 16-31GB → 8B instruct - * 8-15GB → 3B instruct (default) - * <8GB → 3B instruct (will be slow but works) + * 16GB+ → Qwen3.5 forged family, larger quant/variant if available + * <16GB → Qwen3.5 forged family, compact quant */ export function selectLocalModel(vramGB: number): string { // Use our forged Qwen models — the whole point of the forge pipeline @@ -254,6 +235,7 @@ export function selectLocalModel(vramGB: number): string { export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: string[]; gpu: GpuInfo } { const gpu = detectGpu(); + const secrets = SecretManager.getInstance(); const vramGB = gpu.vramGB; const summary: string[] = []; const available: PersonaConfig[] = []; @@ -267,10 +249,12 @@ export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: st summary.push(`${gpu.device}: ${vramGB > 0 ? `${vramGB}GB ${gpu.type.toUpperCase()} (${usableVram}GB usable after ${vramReserve}GB system reserve)` : 'no GPU detected (CPU-only)'}`); - for (const persona of PERSONA_CONFIGS) { + const candidates = [...PERSONA_CONFIGS, ...OPTIONAL_CLOUD_PERSONA_CONFIGS]; + + for (const persona of candidates) { // Sentinel: special case if (persona.provider === 'sentinel') { - if (process.env.SENTINEL_PATH) { + if (secrets.has('SENTINEL_PATH')) { available.push(persona); } else { skipped.push(`${persona.displayName} (SENTINEL_PATH not set)`); @@ -278,10 +262,12 @@ export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: st continue; } - // Local candle inference: check available memory (VRAM or system RAM) - // In Docker / CPU mode, Metal/CUDA aren't available — Candle uses system RAM. - // A 4B Q4_K_M model needs ~3GB regardless of whether it's in VRAM or RAM. - if (persona.provider === 'candle') { + // Local inference: check available memory (VRAM/unified memory or system RAM). 
+ // This is an admission hint only. Concrete model/artifact choice stays + // behind modelRef + Rust registry selection. + // In Docker / non-GPU mode the budget is taken from system RAM. The Rust + // registry decides whether a supported local runtime can actually serve it. + if (persona.provider === 'local') { const needed = persona.minVramGB ?? 4; // Use VRAM if available, otherwise fall back to system RAM const effectiveMemory = usableVram > 0 ? usableVram : getSystemRamGB() - 4; // 4GB reserve for OS + Docker @@ -289,7 +275,7 @@ export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: st available.push(persona); vramAllocated += needed; if (usableVram === 0) { - summary.push(`${persona.displayName}: CPU inference (${needed}GB RAM)`); + summary.push(`${persona.displayName}: local runtime pending (${needed}GB RAM budget)`); } } else { skipped.push(`${persona.displayName} (needs ${needed}GB, ${effectiveMemory - vramAllocated}GB left)`); @@ -299,10 +285,10 @@ export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: st // Cloud providers: check API key if (persona.apiKeyEnv) { - if (process.env[persona.apiKeyEnv]) { + if (secrets.has(persona.apiKeyEnv)) { available.push(persona); } else { - skipped.push(`${persona.displayName} (${persona.apiKeyEnv} not set)`); + skipped.push(`${persona.displayName} (${persona.apiKeyEnv} not configured)`); } continue; } @@ -312,12 +298,12 @@ export function getAvailablePersonas(): { personas: PersonaConfig[]; summary: st } // Zero personas = broken UX. Always seed at least Helper AI so the user - // sees a living system. CPU inference is slow but functional. + // sees which local runtime/config is missing.
if (available.length === 0) { const helper = PERSONA_CONFIGS.find(p => p.displayName === 'Helper AI'); if (helper) { available.push(helper); - summary.push('No GPU/API keys — seeding Helper AI for CPU inference (slow but functional)'); + summary.push('No GPU/API keys — seeding Helper AI for local-runtime diagnostics'); } } diff --git a/src/shared/ModelRegistry.ts b/src/shared/ModelRegistry.ts index 128b4175d..89fa6e6e1 100644 --- a/src/shared/ModelRegistry.ts +++ b/src/shared/ModelRegistry.ts @@ -3,8 +3,7 @@ * * ALL model lookups go through here. Consumers: * - src/scripts/seed/personas.ts (resolves persona.modelRef → current modelId) - * - src/daemons/ai-provider-daemon/adapters/candle/CandleAdapter.ts - * (accepts symbolic refs, resolves to concrete model) + * - Rust local runtime/admission code (accepts symbolic refs, resolves to concrete model) * - src/scripts/download-models.sh (reads via jq for tier/auto_download set) * - install.sh (reads via jq for PERSONA_MODEL tier resolution) * diff --git a/src/shared/workers/PersonaWorkerThread.ts b/src/shared/workers/PersonaWorkerThread.ts index 5ba1c5c84..4e984db40 100644 --- a/src/shared/workers/PersonaWorkerThread.ts +++ b/src/shared/workers/PersonaWorkerThread.ts @@ -9,7 +9,8 @@ * * Phase 1: Skeleton implementation (ping-pong only) * Phase 2: Add message evaluation - * Phase 3: Add real Candle inference + * Phase 3: Runtime gate comes from Rust fullEvaluate; this worker remains a + * lightweight fallback and must not initialize local inference backends. 
*/ import { Worker } from 'worker_threads'; @@ -41,7 +42,7 @@ interface ProviderConfig { } interface WorkerConfig { - providerType?: 'candle' | 'local' | 'openai' | 'anthropic' | 'mock'; + providerType?: 'local' | 'openai' | 'anthropic' | 'mock'; providerConfig?: ProviderConfig; } @@ -54,10 +55,9 @@ interface WorkerConfig { * const latency = await worker.ping(); // Test communication * await worker.shutdown(); // Clean termination * - * Phase 3 Usage (with provider config): + * Runtime usage: * const worker = new PersonaWorkerThread('persona-id-123', { - * providerType: 'candle', - * providerConfig: { model: 'llama3.2:1b' } + * providerType: 'local' * }); */ export class PersonaWorkerThread extends EventEmitter { diff --git a/src/shared/workers/persona-worker.ts b/src/shared/workers/persona-worker.ts index a35143627..902278869 100644 --- a/src/shared/workers/persona-worker.ts +++ b/src/shared/workers/persona-worker.ts @@ -7,14 +7,13 @@ * * Phase 1: Skeleton (ping-pong) * Phase 2: Mock evaluation - * Phase 3: Real Candle (native Rust) inference + * Phase 3: Runtime gating delegates to Rust/heuristics. * - * NOTE: Candle is the ONLY local inference path. + * NOTE: Candle is training/auxiliary only. Local chat inference is llama.cpp/Qwen + * through the Rust runtime, not this worker. 
*/ import { parentPort, workerData } from 'worker_threads'; -import { CandleGrpcAdapter } from '../../daemons/ai-provider-daemon/adapters/candle-grpc/shared/CandleGrpcAdapter'; -import type { BaseAIProviderAdapter } from '../../daemons/ai-provider-daemon/shared/BaseAIProviderAdapter'; if (!parentPort) { throw new Error('This file must be run as a Worker Thread'); @@ -27,19 +26,10 @@ const _providerConfig: Record = workerData.providerConfig || {} console.log(`🧵 PersonaWorker[${personaId}]: Starting...`); console.log(`🧵 PersonaWorker[${personaId}]: Provider type: ${providerType}`); -// Initialize provider (if not mock) -let provider: BaseAIProviderAdapter | null = null; - async function initializeProvider(): Promise { - // 'candle' or 'local' both use Candle - if (providerType === 'candle' || providerType === 'local') { - console.log(`🧵 PersonaWorker[${personaId}]: Initializing CandleGrpcAdapter...`); - - const adapter = new CandleGrpcAdapter(); - await adapter.initialize(); - provider = adapter; - console.log(`✅ PersonaWorker[${personaId}]: CandleGrpcAdapter initialized`); - } + // Intentionally no local model initialization here. should-respond is + // handled by Rust fullEvaluate; this worker is only a fallback heuristic + // path. Do not load Candle/llama.cpp from this thread. } // Main async initialization @@ -74,48 +64,10 @@ async function initializeProvider(): Promise { let processingTime = 0; try { - if (provider) { - // Real Candle inference (Phase 3) - console.log(`🧠 PersonaWorker[${personaId}]: Using real Candle inference...`); - - const prompt = `You are evaluating whether you should respond to a message in a conversation. - -Message: "${msg.message.content}" -Sender: ${msg.message.senderId} - -Respond with a confidence score (0.0-1.0) indicating whether you should respond. -Consider: -- Is this message directed at you or relevant to your expertise? -- Is it a test message that should be ignored? -- Would your response add value to the conversation? 
- -Format your response as: -CONFIDENCE: -REASONING: `; - - const result = await provider.generateText({ - messages: [ - { role: 'user', content: prompt } - ], - model: (_providerConfig.model as string) || 'llama3.2:1b', - temperature: 0.7, - maxTokens: 200 - }); - - // Parse confidence from AI response - const confidenceMatch = result.text.match(/CONFIDENCE:\s*([0-9.]+)/i); - const reasoningMatch = result.text.match(/REASONING:\s*(.+)/is); - - confidence = confidenceMatch ? parseFloat(confidenceMatch[1]) : 0.5; - confidence = Math.max(0, Math.min(1, confidence)); // Clamp 0-1 - shouldRespond = confidence > 0.5; - reasoning = reasoningMatch ? reasoningMatch[1].trim().substring(0, 200) : result.text.substring(0, 200); - - processingTime = Date.now() - startTime; - console.log(`✅ PersonaWorker[${personaId}]: Real inference complete - conf=${confidence.toFixed(2)}, took ${processingTime}ms`); - - } else { - // Smart heuristics evaluation with PersonaState integration + { + // Smart heuristics evaluation with PersonaState integration. + // This path is intentionally model-free; Rust fullEvaluate owns + // the authoritative gate in normal runtime. 
console.log(`🎭 PersonaWorker[${personaId}]: Using smart heuristics with state...`); const thinkTime = 100 + Math.random() * 400; diff --git a/src/system/adapters/IAdapterProvider.ts b/src/system/adapters/IAdapterProvider.ts index d2f360822..4ea6fa981 100644 --- a/src/system/adapters/IAdapterProvider.ts +++ b/src/system/adapters/IAdapterProvider.ts @@ -2,7 +2,7 @@ * Adapter Provider Interface * * Abstracts adapter operations across different backends: - * - Local (Candle) - direct LoRA weight merging + * - Local - direct LoRA weight merging against supported local model families * - Together.ai - cloud LoRA hosting * - Fireworks.ai - cloud LoRA hosting * - Replicate - custom model deployment @@ -21,9 +21,9 @@ export type ProviderType = 'local' | 'cloud-lora' | 'cloud-finetune'; * Supported base models per provider */ export interface SupportedModel { - id: string; // e.g., "meta-llama/Llama-3.2-3B-Instruct" - name: string; // e.g., "Llama 3.2 3B" - family: string; // e.g., "llama" + id: string; // e.g., "continuum-ai/qwen3.5-4b-code-forged-GGUF" + name: string; // e.g., "Qwen3.5 4B Code Forged" + family: string; // e.g., "qwen3" maxContext: number; // e.g., 128000 supportedRanks: number[]; // e.g., [8, 16, 32, 64] } diff --git a/src/system/adapters/LocalAdapterProvider.ts b/src/system/adapters/LocalAdapterProvider.ts index 4be7b74e9..c5164c00d 100644 --- a/src/system/adapters/LocalAdapterProvider.ts +++ b/src/system/adapters/LocalAdapterProvider.ts @@ -1,7 +1,7 @@ /** * Local Adapter Provider * - * Manages LoRA adapters for local inference via Candle. + * Manages LoRA adapters for local Qwen-family models. * Direct weight merging - no cloud dependencies. */ @@ -21,13 +21,13 @@ import * as path from 'path'; import { GlobalPaths } from '../core/config/SystemPaths'; /** - * Local adapter provider - Candle inference + * Local adapter provider. 
*/ export class LocalAdapterProvider implements IAdapterProvider { readonly name = 'local'; readonly type: ProviderType = 'local'; readonly source: AdapterSource = 'local'; - readonly description = 'Local inference via Candle with direct LoRA weight merging'; + readonly description = 'Local Qwen-family adapter management with direct LoRA weight merging'; private readonly registryPath: string; private readonly client: InferenceGrpcClient; @@ -44,23 +44,23 @@ export class LocalAdapterProvider implements IAdapterProvider { async getSupportedModels(): Promise { return [ { - id: 'unsloth/Llama-3.2-3B-Instruct', - name: 'Llama 3.2 3B', - family: 'llama', + id: 'continuum-ai/qwen3.5-4b-code-forged-GGUF', + name: 'Qwen3.5 4B Code Forged', + family: 'qwen3', maxContext: 8192, supportedRanks: [1, 2, 4, 8, 16, 32, 64], }, { - id: 'meta-llama/Llama-3.2-3B-Instruct', - name: 'Llama 3.2 3B (Meta)', - family: 'llama', + id: 'continuum-ai/qwen3.5-2b-general-forged', + name: 'Qwen3.5 2B General Forged', + family: 'qwen3', maxContext: 8192, supportedRanks: [1, 2, 4, 8, 16, 32, 64], }, { - id: 'meta-llama/Llama-3.2-1B-Instruct', - name: 'Llama 3.2 1B', - family: 'llama', + id: 'Qwen/Qwen2-VL-7B-Instruct-GGUF', + name: 'Qwen2-VL 7B Instruct', + family: 'qwen2-vl', maxContext: 8192, supportedRanks: [1, 2, 4, 8, 16, 32], }, diff --git a/src/system/ai/server/AIDecisionService.ts b/src/system/ai/server/AIDecisionService.ts index f9776c49e..87e9ab3d6 100644 --- a/src/system/ai/server/AIDecisionService.ts +++ b/src/system/ai/server/AIDecisionService.ts @@ -18,6 +18,7 @@ import type { TextGenerationRequest, TextGenerationResponse } from '../../../dae import type { RAGContext } from '../../rag/shared/RAGTypes'; import { AIDecisionLogger } from './AIDecisionLogger'; import { InferenceCoordinator } from '../../coordination/server/InferenceCoordinator'; +import { LOCAL_MODELS } from '../../shared/Constants'; /** * AI Gating Decision - Result of "should I respond?" 
evaluation @@ -382,9 +383,9 @@ ${generatedText} } = {} ): Promise { const startTime = Date.now(); - const model = options.model ?? 'llama3.2:3b'; - const timeoutMs = options.timeoutMs ?? 180000; // 3 min for Candle inference (can be slow) - const provider = 'candle'; // Response generation uses local Candle inference + const model = options.model ?? LOCAL_MODELS.DEFAULT; + const timeoutMs = options.timeoutMs ?? 180000; // local Qwen inference can be slow under load + const provider = 'local'; // Request inference slot to prevent thundering herd const messageId = options.messageId ?? context.triggerMessage?.id ?? 'generate-' + Date.now(); @@ -409,10 +410,9 @@ ${generatedText} model, temperature: options.temperature ?? 0.7, maxTokens: options.maxTokens ?? 150, - // 'local' is the routing sentinel for "best available local GPU - // adapter" — the Rust AdapterRegistry picks llamacpp-local on - // Mac, DMR elsewhere. Previous 'candle' was the dead adapter's - // name; routing returned None and this whole path silently errored. + // 'local' is the routing sentinel for the best available local + // Qwen/llama.cpp runtime. Engine selection stays behind the Rust + // registry/admission layer. provider: 'local' }; diff --git a/src/system/coordination/server/InferenceCoordinator.ts b/src/system/coordination/server/InferenceCoordinator.ts index 5f34e0e24..a12e27923 100644 --- a/src/system/coordination/server/InferenceCoordinator.ts +++ b/src/system/coordination/server/InferenceCoordinator.ts @@ -43,8 +43,9 @@ export interface InferenceSlot { * Provider groups that share the same backend. * All providers in a group share the same slot pool. * - * CRITICAL: 'sentinel', 'candle', 'local' all route to the same - * gRPC/Candle server which processes requests serially. They MUST share slots. + * CRITICAL: legacy 'candle', 'sentinel', and 'local' all consume the same + * local-inference capacity. 
Runtime persona chat should request 'local'; + * 'candle' remains a compatibility key for training/legacy callers. */ const PROVIDER_GROUPS: Record = { 'sentinel': 'local-inference', diff --git a/src/system/orchestration/SystemOrchestrator.ts b/src/system/orchestration/SystemOrchestrator.ts index 3aaa094c0..9abb819da 100644 --- a/src/system/orchestration/SystemOrchestrator.ts +++ b/src/system/orchestration/SystemOrchestrator.ts @@ -163,11 +163,8 @@ export class SystemOrchestrator extends EventEmitter { browserOpened: requiredMilestones.includes(SYSTEM_MILESTONES.BROWSER_READY) }; - // TEST MODE: Generate signal and let caller handle exit - if (options.testMode) { - console.debug('🧪 Test mode - generating final system ready signal'); - await this.signaler.generateReadySignal(); - } + console.debug('📡 Generating system ready signal'); + await this.signaler.generateReadySignal(); return finalState; } @@ -192,12 +189,9 @@ export class SystemOrchestrator extends EventEmitter { const finalState = await this.verifySystemState(requiredMilestones); console.debug('🎉 Orchestration complete'); - // TEST MODE: Generate final signal after successful orchestration - if (options.testMode) { - console.debug('🧪 Test mode - generating final system ready signal'); - await this.signaler.generateReadySignal(); - console.debug('📡 Final system signal generated - ready for testing'); - } + console.debug('📡 Generating final system ready signal'); + await this.signaler.generateReadySignal(); + console.debug('📡 Final system signal generated'); return finalState; @@ -955,33 +949,7 @@ export class SystemOrchestrator extends EventEmitter { // In Docker, the widget-server container handles HTTP separately, // so skip spawning the HTTP server when JTAG_SKIP_HTTP is set. 
if (!process.env.JTAG_SKIP_HTTP) { - const { getActiveExamplePath } = await import('../../examples/server/ExampleConfigServer'); - const activeExamplePath = getActiveExamplePath(); - const serverScript = `${activeExamplePath}/src/minimal-server.ts`; - - console.debug(`🎯 Starting HTTP server directly: ${serverScript}`); - - this.serverProcess = spawn('npx', ['tsx', serverScript], { - cwd: activeExamplePath, - stdio: ['ignore', 'pipe', 'pipe'], - shell: false - }); - - this.serverProcess.stdout?.on('data', (data) => { - console.debug(`📄 HTTP Server: ${data.toString().trim()}`); - }); - - this.serverProcess.stderr?.on('data', (data) => { - console.debug(`⚠️ HTTP Server Error: ${data.toString().trim()}`); - }); - - this.serverProcess.on('error', (error) => { - console.error(`❌ Server process failed: ${error.message}`); - }); - - this.serverProcess.on('exit', (code, signal) => { - console.debug(`📋 HTTP Server process exited: code=${code}, signal=${signal}`); - }); + await this.spawnHttpServer(); } else { console.debug(`⏭️ Skipping HTTP server (JTAG_SKIP_HTTP set — widget-server handles HTTP)`); } @@ -993,6 +961,47 @@ export class SystemOrchestrator extends EventEmitter { return true; } + private async spawnHttpServer(): Promise { + const { getActiveExamplePath } = await import('../../examples/server/ExampleConfigServer'); + const activeExamplePath = getActiveExamplePath(); + const serverScript = `${activeExamplePath}/src/minimal-server.ts`; + + console.debug(`🎯 Starting HTTP server directly: ${serverScript}`); + + this.serverProcess = spawn('npx', ['tsx', serverScript], { + cwd: activeExamplePath, + stdio: ['ignore', 'pipe', 'pipe'], + shell: false + }); + + this.serverProcess.stdout?.on('data', (data) => { + console.debug(`📄 HTTP Server: ${data.toString().trim()}`); + }); + + this.serverProcess.stderr?.on('data', (data) => { + console.debug(`⚠️ HTTP Server Error: ${data.toString().trim()}`); + }); + + this.serverProcess.on('error', (error) => { + console.error(`❌ 
Server process failed: ${error.message}`); + }); + + this.serverProcess.on('exit', (code, signal) => { + console.debug(`📋 HTTP Server process exited: code=${code}, signal=${signal}`); + this.serverProcess = null; + if (!this.coreShuttingDown && !process.env.JTAG_SKIP_HTTP) { + console.warn(`🔁 HTTP server exited unexpectedly; restarting in 1000ms`); + setTimeout(() => { + if (!this.coreShuttingDown && !this.serverProcess) { + this.spawnHttpServer().catch(error => { + console.error(`❌ Failed to restart HTTP server: ${error instanceof Error ? error.message : String(error)}`); + }); + } + }, 1000); + } + }); + } + private async executeServerProcess(): Promise { console.debug('🔄 Server process ready...'); await milestoneEmitter.completeMilestone( diff --git a/src/system/rag/sources/CodebaseSearchSource.ts b/src/system/rag/sources/CodebaseSearchSource.ts index e8c6faa9a..3787b9c22 100644 --- a/src/system/rag/sources/CodebaseSearchSource.ts +++ b/src/system/rag/sources/CodebaseSearchSource.ts @@ -28,6 +28,24 @@ const MIN_QUERY_LENGTH = 15; /** Similarity threshold — only inject results that are genuinely relevant */ const RELEVANCE_THRESHOLD = 0.35; +/** Source-local latency budget. Code context is useful, but chat must not wait + * on a cold or oversized index. The source degrades to empty context instead + * of letting the whole persona response pipeline stall behind RAGComposer's + * broader watchdog. */ +const QUERY_TIMEOUT_MS = Number(process.env.CONTINUUM_CODEBASE_RAG_TIMEOUT_MS ?? 
4_000); + +const TECHNICAL_QUERY_PATTERN = new RegExp([ + '\\b(code|codebase|repo|repository|file|files|function|class|interface|type|module|import|export)\\b', + '\\b(bug|error|exception|stack|trace|crash|failing|failure|fix|debug|compile|build)\\b', + '\\b(unit|integration|e2e|regression)\\s+tests?\\b', + '\\btests?\\s+(failed|failing|fail|red|broken|pass|passing|green)\\b', + '\\b(cargo|npm|pnpm|yarn|pytest|vitest|jest|playwright)\\s+test\\b', + '\\b(refactor|architecture|architect|implement|implementation|api|endpoint|schema|database|docker)\\b', + '\\b(rust|typescript|javascript|tsx|jsx|node|python|cargo|npm|sql|sqlite|postgres)\\b', + '`[^`]+`', + '[\\w./-]+\\.(ts|tsx|js|jsx|rs|py|toml|json|md|sql|sh|ps1)\\b', +].join('|'), 'i'); + export class CodebaseSearchSource implements RAGSource { readonly name = 'codebase-search'; readonly tier = PromptTier.VOLATILE; @@ -36,13 +54,21 @@ export class CodebaseSearchSource implements RAGSource { readonly isShared = true; isApplicable(context: RAGSourceContext): boolean { - // Always applicable if there's a substantive message. - // The persona's mind decides what context matters — we just provide the capability. - // If results aren't relevant (low cosine similarity), the query returns empty - // and costs nothing in the token budget. const currentMessage = context.options?.currentMessage?.content; if (!currentMessage || typeof currentMessage !== 'string') return false; - return currentMessage.length >= MIN_QUERY_LENGTH; + + // Recipe-owned RAG activation is authoritative. If a queue item or room + // recipe explicitly asks for codebase-search, provide it even when the + // surface text is terse ("fix this", "same bug"). + if (context.activeSources?.includes(this.name)) return true; + + if (currentMessage.trim().length < MIN_QUERY_LENGTH) return false; + + // Default chat should stay conversational. 
Pulling semantic code search + // for every ordinary room message turns one human prompt into N expensive + // index queries across personas and was observed to wedge chat behind a + // 30s RAG timeout. Codebase context is activated by technical intent. + return TECHNICAL_QUERY_PATTERN.test(currentMessage); } async load(context: RAGSourceContext, allocatedBudget: number): Promise> { @@ -51,7 +77,7 @@ export class CodebaseSearchSource implements RAGSource { try { const indexer = getCodebaseIndexer(); - const results = await indexer.query(query, MAX_RESULTS); + const results = await this.withQueryTimeout(indexer.query(query, MAX_RESULTS), query); // Filter by relevance — only inject results the persona would actually find useful const relevant = results.filter(r => (r.relevanceScore ?? 0) >= RELEVANCE_THRESHOLD); @@ -99,4 +125,19 @@ export class CodebaseSearchSource implements RAGSource { }; } } + + private async withQueryTimeout(queryPromise: Promise, query: string): Promise { + let timer: ReturnType | null = null; + try { + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => { + reject(new Error(`codebase search exceeded ${QUERY_TIMEOUT_MS}ms for "${query.slice(0, 40)}..."`)); + }, QUERY_TIMEOUT_MS); + timer.unref?.(); + }); + return await Promise.race([queryPromise, timeout]); + } finally { + if (timer) clearTimeout(timer); + } + } } diff --git a/src/system/rag/sources/ConversationHistorySource.ts b/src/system/rag/sources/ConversationHistorySource.ts index 7a5a43345..2b2a59257 100644 --- a/src/system/rag/sources/ConversationHistorySource.ts +++ b/src/system/rag/sources/ConversationHistorySource.ts @@ -16,6 +16,7 @@ import { ORM } from '../../../daemons/data-daemon/server/ORM'; import { ChatMessageEntity, type MediaItem } from '../../data/entities/ChatMessageEntity'; import { Events } from '../../core/shared/Events'; import { Logger } from '../../core/logging/Logger'; +import { detectConversationHistoryPoison } from 
'./conversationHistoryPoison'; const log = Logger.create('ConversationHistorySource', 'rag'); @@ -23,61 +24,6 @@ const log = Logger.create('ConversationHistorySource', 'rag'); // Token budget is the real constraint; 100 messages is plenty for any conversation window. const DB_FETCH_LIMIT = 100; -// Patterns for detecting fabricated conversations within a single message body. -// These messages were generated by models that hallucinated entire multi-party -// conversations instead of responding as themselves. They poison LLM context -// and cause cascading failures (cloud AIs adopting "silence protocol"). -// -// Formats seen in the wild: -// "2/16/2026 2:24:03 PM Teacher AI: ..." (date + time + speaker) -// "[02:01] Teacher AI: ..." (bracketed time + speaker) -// "[03:00] Helper AI: That's a good point..." (bracketed time + speaker) -// "Gemini: I'm happy to chat..." (single-word speaker prefix) -// "Teacher AI: I think that's a great..." (multi-word speaker prefix) - -// Full date + time at line start -const FABRICATED_DATE_RE = /^\s*\d{1,4}[/-]\d{1,2}[/-]\d{1,4}\s+\d{1,2}:\d{2}\s+[A-Z]/gm; -// Bracketed time at line start: [02:01], [14:30], etc. -const FABRICATED_BRACKET_TIME_RE = /^\s*\[\d{1,2}:\d{2}\]\s+[A-Z]/gm; -// Multi-word speaker prefix: "Teacher AI:", "Helper AI:", "CodeReview AI:" -const FABRICATED_SPEAKER_RE = /^[A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*:\s+\S/gm; -// Single-word known AI speaker prefix: "Gemini:", "Groq:", "Together:", "Fireworks:" -const FABRICATED_SINGLE_SPEAKER_RE = /^(?:Gemini|Groq|Together|Fireworks|Claude|GPT|Local|Joel|Anonymous|Qwen|DeepSeek|Grok|Candle|Helper|Teacher|CodeReview):\s+\S/gm; - -/** - * Check if a message body is a fabricated multi-party conversation. - * Returns true if the message contains 3+ timestamped lines, - * 4+ multi-word speaker prefixes with 2+ distinct names, or - * 3+ single-word known AI speaker prefixes. 
- */ -function isFabricatedConversation(text: string): boolean { - if (!text || text.length < 60) return false; - - // Check 1: Full date+time timestamped speaker lines - const dateMatches = text.match(FABRICATED_DATE_RE); - if (dateMatches && dateMatches.length >= 3) return true; - - // Check 2: Bracketed [HH:MM] timestamped lines - const bracketMatches = text.match(FABRICATED_BRACKET_TIME_RE); - if (bracketMatches && bracketMatches.length >= 3) return true; - - // Check 3: Multi-word speaker prefixes with distinct names - const speakerMatches = text.match(FABRICATED_SPEAKER_RE); - if (speakerMatches && speakerMatches.length >= 4) { - const names = new Set(speakerMatches.map(m => m.split(':')[0].trim())); - if (names.size >= 2) return true; - } - - // Check 4: Single-word known AI speaker prefixes - const singleMatches = text.match(FABRICATED_SINGLE_SPEAKER_RE); - if (singleMatches && singleMatches.length >= 3) { - const names = new Set(singleMatches.map(m => m.split(':')[0].trim())); - if (names.size >= 2) return true; - } - - return false; -} - // ── Bare tool call detection ────────────────────────────────────── // When an AI outputs a tool call as plain text (not a proper tool_use block), // it gets saved as a chat message. Other AIs see it in history and copy the @@ -307,17 +253,26 @@ export class ConversationHistorySource implements RAGSource { // Filter out fabricated conversation messages — hallucinated multi-party // conversations that poison context and cause cascading failures. 
let filteredCount = 0; + let metaSummaryCount = 0; const cleanMessages = messages.filter((msg: MessageWithSender) => { const text = msg.content?.text || ''; - if (isFabricatedConversation(text)) { + const poisonReason = detectConversationHistoryPoison(text); + if (poisonReason === 'fabricated-conversation') { filteredCount++; return false; } + if (poisonReason === 'meta-summary-echo') { + metaSummaryCount++; + return false; + } return true; }); if (filteredCount > 0) { log.warn(`Filtered ${filteredCount} fabricated conversation messages from history`); } + if (metaSummaryCount > 0) { + log.warn(`Filtered ${metaSummaryCount} meta-summary echo messages from history`); + } // Sanitize bare tool call messages — replace with contextual note // so other AIs know someone attempted a tool but don't copy the broken syntax diff --git a/src/system/rag/sources/conversationHistoryPoison.ts b/src/system/rag/sources/conversationHistoryPoison.ts new file mode 100644 index 000000000..c4c4147fd --- /dev/null +++ b/src/system/rag/sources/conversationHistoryPoison.ts @@ -0,0 +1,58 @@ +// Patterns for detecting generated chat artifacts that poison future RAG turns. +// Keep this file pure: no ORM, logger, or server imports, so it can be tested +// without booting the Continuum runtime. + +// Full date + time at line start +const FABRICATED_DATE_RE = /^\s*\d{1,4}[/-]\d{1,2}[/-]\d{1,4}\s+\d{1,2}:\d{2}\s+[A-Z]/gm; +// Bracketed time at line start: [02:01], [14:30], etc. +const FABRICATED_BRACKET_TIME_RE = /^\s*\[\d{1,2}:\d{2}\]\s+[A-Z]/gm; +// Multi-word speaker prefix: "Teacher AI:", "Helper AI:", "CodeReview AI:" +const FABRICATED_SPEAKER_RE = /^[A-Z][a-zA-Z]+\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*:\s+\S/gm; +// Single-word known AI speaker prefix: "Gemini:", "Groq:", etc. 
+const FABRICATED_SINGLE_SPEAKER_RE = /^(?:Gemini|Groq|Together|Fireworks|Claude|GPT|Local|Joel|Anonymous|Qwen|DeepSeek|Grok|Candle|Helper|Teacher|CodeReview):\s+\S/gm; + +// Persona meta-summary pattern observed during startup smoke tests. +const META_SUMMARY_ECHO_RE = /\bI received a message from\s+[A-Z][\w -]{1,80}:\s*["“][\s\S]{10,}["”][\s\S]{0,800}\b(?:This indicates|The key pattern here|successfully acknowledged|responded to the startup smoke test)\b/i; + +export type ConversationHistoryPoisonReason = 'fabricated-conversation' | 'meta-summary-echo'; + +/** + * Check if a message body is a fabricated multi-party conversation. + * Returns true if the message contains 3+ timestamped lines, + * 4+ multi-word speaker prefixes with 2+ distinct names, or + * 3+ single-word known AI speaker prefixes. + */ +export function isFabricatedConversation(text: string): boolean { + if (!text || text.length < 60) return false; + + const dateMatches = text.match(FABRICATED_DATE_RE); + if (dateMatches && dateMatches.length >= 3) return true; + + const bracketMatches = text.match(FABRICATED_BRACKET_TIME_RE); + if (bracketMatches && bracketMatches.length >= 3) return true; + + const speakerMatches = text.match(FABRICATED_SPEAKER_RE); + if (speakerMatches && speakerMatches.length >= 4) { + const names = new Set(speakerMatches.map(m => m.split(':')[0].trim())); + if (names.size >= 2) return true; + } + + const singleMatches = text.match(FABRICATED_SINGLE_SPEAKER_RE); + if (singleMatches && singleMatches.length >= 3) { + const names = new Set(singleMatches.map(m => m.split(':')[0].trim())); + if (names.size >= 2) return true; + } + + return false; +} + +export function isMetaSummaryEcho(text: string): boolean { + if (!text || text.length < 80) return false; + return META_SUMMARY_ECHO_RE.test(text); +} + +export function detectConversationHistoryPoison(text: string): ConversationHistoryPoisonReason | null { + if (isFabricatedConversation(text)) return 'fabricated-conversation'; + if 
(isMetaSummaryEcho(text)) return 'meta-summary-echo'; + return null; +} diff --git a/src/system/rag/test/unit/CodebaseSearchSource.test.ts b/src/system/rag/test/unit/CodebaseSearchSource.test.ts new file mode 100644 index 000000000..798c12da2 --- /dev/null +++ b/src/system/rag/test/unit/CodebaseSearchSource.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from 'vitest'; +import { CodebaseSearchSource } from '../../sources/CodebaseSearchSource'; +import type { RAGSourceContext } from '../../shared/RAGSource'; + +function contextFor(message: string, activeSources?: readonly string[]): RAGSourceContext { + return { + personaId: 'persona-1' as any, + roomId: 'room-1' as any, + options: { + currentMessage: { + role: 'user', + content: message, + name: 'Developer', + timestamp: Date.now(), + }, + modelId: 'continuum-ai/qwen3.5-4b-code-forged-GGUF', + provider: 'local', + maxTokens: 256, + contextWindow: 8192, + tokensPerSecond: 15, + }, + totalBudget: 4096, + provider: 'local', + activeSources, + }; +} + +describe('CodebaseSearchSource activation', () => { + it('does not run codebase search for ordinary chat', () => { + const source = new CodebaseSearchSource(); + + expect(source.isApplicable(contextFor('Personas: reply with your name and confirm you can see this message.'))).toBe(false); + expect(source.isApplicable(contextFor('Teacher AI: Yes, I can confirm seeing this startup smoke test in the General room.'))).toBe(false); + expect(source.isApplicable(contextFor('tacos, tell me all you know'))).toBe(false); + }); + + it('runs for technical/code intent', () => { + const source = new CodebaseSearchSource(); + + expect(source.isApplicable(contextFor('Why does ChatRAGBuilder time out on codebase-search?'))).toBe(true); + expect(source.isApplicable(contextFor('Fix workers/continuum-core/src/model_registry/artifacts.rs'))).toBe(true); + expect(source.isApplicable(contextFor('The docker build is failing with a Rust compile error.'))).toBe(true); + 
expect(source.isApplicable(contextFor('The integration tests are failing after the Docker refactor.'))).toBe(true); + }); + + it('honors explicit recipe source activation', () => { + const source = new CodebaseSearchSource(); + + expect(source.isApplicable(contextFor('fix this', ['codebase-search']))).toBe(true); + }); +}); diff --git a/src/system/rag/test/unit/ConversationHistorySource.test.ts b/src/system/rag/test/unit/ConversationHistorySource.test.ts new file mode 100644 index 000000000..8781906fe --- /dev/null +++ b/src/system/rag/test/unit/ConversationHistorySource.test.ts @@ -0,0 +1,27 @@ +import { describe, expect, it } from 'vitest'; +import { detectConversationHistoryPoison } from '../../sources/conversationHistoryPoison'; + +describe('ConversationHistorySource context poison detection', () => { + it('filters persona meta-summary echoes from future RAG context', () => { + const poisoned = 'I received a message from Helper AI: "Teacher AI: Yes, I can confirm seeing this startup smoke test in the General room." This indicates that Teacher AI successfully acknowledged and responded to the startup smoke test message as expected. 
The key pattern here is the successful completion of a multi-step communication sequence.'; + + expect(detectConversationHistoryPoison(poisoned)).toBe('meta-summary-echo'); + }); + + it('keeps ordinary user and persona messages', () => { + expect(detectConversationHistoryPoison('tacos, tell me all you know')).toBeNull(); + expect(detectConversationHistoryPoison('Helper AI: I can see this startup smoke test in the General room.')).toBeNull(); + expect(detectConversationHistoryPoison('I received your startup smoke test and can respond as Helper AI.')).toBeNull(); + }); + + it('still filters fabricated multi-speaker transcripts', () => { + const fabricated = [ + 'Teacher AI: I think we should test the room.', + 'Helper AI: Agreed, I can see the room.', + 'Teacher AI: Please confirm the model route.', + 'Helper AI: Confirmed, routing is local.' + ].join('\n'); + + expect(detectConversationHistoryPoison(fabricated)).toBe('fabricated-conversation'); + }); +}); diff --git a/src/system/secrets/SecretManager.ts b/src/system/secrets/SecretManager.ts index 7bab67603..a7cdc948d 100644 --- a/src/system/secrets/SecretManager.ts +++ b/src/system/secrets/SecretManager.ts @@ -141,9 +141,11 @@ export class SecretManager { * @param requestedBy - Who is requesting (for audit trail) */ get(key: string, requestedBy = 'unknown'): string | undefined { + this.ensureInitialized(); this.logAccess(key, requestedBy); - return this.secrets.get(key); + const value = this.secrets.get(key); + return value && value.trim().length > 0 ? 
value : undefined; } /** @@ -169,7 +171,7 @@ export class SecretManager { * Check if secret exists */ has(key: string): boolean { - return this.secrets.has(key); + return this.get(key, 'SecretManager.has') !== undefined; } /** @@ -179,7 +181,7 @@ export class SecretManager { * Returns defaultValue if key not found */ getBoolean(key: string, defaultValue = false): boolean { - const value = this.secrets.get(key); + const value = this.get(key, 'SecretManager.getBoolean'); if (value === undefined) { return defaultValue; } @@ -192,7 +194,7 @@ export class SecretManager { * Returns defaultValue if key not found or not a valid number */ getNumber(key: string, defaultValue = 0): number { - const value = this.secrets.get(key); + const value = this.get(key, 'SecretManager.getNumber'); if (value === undefined) { return defaultValue; } @@ -205,7 +207,10 @@ export class SecretManager { * Safe to expose to browser for UI rendering */ getAvailableKeys(): string[] { - return Array.from(this.secrets.keys()); + this.ensureInitialized(); + return Array.from(this.secrets.entries()) + .filter(([, value]) => value.trim().length > 0) + .map(([key]) => key); } /** @@ -213,10 +218,11 @@ export class SecretManager { * IMPORTANT: Only call this from secure server-side code! 
*/ async set(key: string, value: string): Promise { - this.secrets.set(key, value); + const normalizedValue = this.normalizeEnvValue(value); + this.secrets.set(key, normalizedValue); // Persist to ~/.continuum/config.env - await this.persistToHomeConfig(key, value); + await this.persistToHomeConfig(key, normalizedValue); console.log(`🔐 SecretManager: Set ${key} (redacted)`); } @@ -238,6 +244,7 @@ export class SecretManager { * Replaces actual keys with [REDACTED-xxx] */ redact(text: string): string { + this.ensureInitialized(); let redacted = text; for (const [key, value] of this.secrets) { @@ -262,6 +269,12 @@ export class SecretManager { // Private Methods // ======================== + private ensureInitialized(): void { + if (!this.isInitialized) { + this.initializeSync(); + } + } + /** * Load from ~/.continuum/config.env */ @@ -319,8 +332,9 @@ export class SecretManager { const secretPattern = /^[A-Z_]+_(API_KEY|KEY|API_SECRET|SECRET|TOKEN|URL)$/; for (const [key, value] of Object.entries(process.env)) { - if (secretPattern.test(key) && value) { - this.secrets.set(key, value); + const normalizedValue = this.normalizeEnvValue(value ?? ''); + if (secretPattern.test(key) && normalizedValue.length > 0) { + this.secrets.set(key, normalizedValue); } } } @@ -387,25 +401,37 @@ export class SecretManager { const [, key, rawValue] = match; // Expand tilde (~) to home directory - let value = rawValue.trim(); + let value = this.normalizeEnvValue(rawValue); if (value.startsWith('~/')) { value = path.join(os.homedir(), value.slice(2)); } - // Store in secrets Map - this.secrets.set(key, value); + // Empty placeholders document available config keys but must not erase + // a real value already supplied by the shell, Docker, or a higher + // priority config source. 
+ if (value.length > 0 || !this.secrets.has(key)) { + this.secrets.set(key, value); + } // Mirror all config.env values to process.env so they're visible to // subprocesses (jtag CLI, seed scripts) and commands that check process.env // (persona/allocate checks API keys). Don't overwrite env vars already set // by Docker compose or the shell — orchestrator env takes precedence. - if (!process.env[key]) { + if (value.length > 0 && !process.env[key]) { process.env[key] = value; } } } } + private normalizeEnvValue(rawValue: string): string { + let value = rawValue.trim(); + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + return value.trim(); + } + /** * Persist secret to ~/.continuum/config.env */ diff --git a/src/system/shared/Constants.ts b/src/system/shared/Constants.ts index 3274ee01e..60a7cc76e 100644 --- a/src/system/shared/Constants.ts +++ b/src/system/shared/Constants.ts @@ -131,10 +131,10 @@ export const MODEL_IDS = { GROK_4: 'grok-4' }, - /** Candle local models (use LOCAL_MODELS for new code) */ + /** Historical local aliases. Do not use for Continuum runtime selection. */ CANDLE: { - LLAMA_3_2_3B: 'llama3.2:3b', - LLAMA_3_1_8B: 'llama3.1:8b' + QWEN_GATING: 'Qwen/Qwen2-0.5B-Instruct', + QWEN_DEFAULT: 'continuum-ai/qwen3.5-4b-code-forged-GGUF' }, /** Sentinel local models */ @@ -147,16 +147,13 @@ export const MODEL_IDS = { /** * LOCAL_MODELS - SINGLE SOURCE OF TRUTH for local inference * - * ⚠️ CRITICAL: This is the canonical model configuration for Candle (native Rust) inference + * ⚠️ CRITICAL: This is the canonical model configuration for native Rust inference * ⚠️ All model mappings, preloads, and defaults come from here - * ⚠️ CandleAdapter reads from here - DO NOT duplicate mappings elsewhere + * ⚠️ Local runtime/admission reads from here - DO NOT duplicate mappings elsewhere * - * Candle is the ONLY local inference path. 
- * The model name mappings below exist for backward compatibility with - * configs that reference legacy short names like 'llama3.2:3b'. - * - * Note: Using unsloth/ mirrors for Llama models (no HuggingFace access approval needed) - * For meta-llama/ originals: accept license at https://huggingface.co/meta-llama + * Local alpha models are Qwen: Qwen3.5 for text/code and Qwen2-VL for vision. + * Runtime selection is Rust-owned so VRAM/unified-memory pressure, LoRA paging, + * and future MoE/base-model paging stay under one scheduler. */ export const LOCAL_MODELS = { /** Default models for inference worker to preload at startup */ @@ -190,61 +187,15 @@ export const LOCAL_MODELS = { /** BF16 batch-prefill variant — explicitly selects the safetensors backend (32GB+ only) */ CODING_AGENT_BF16: 'coder-bf16', - /** Map legacy model names → HuggingFace model IDs (legacy naming style kept for backward compat) */ + /** Explicit local aliases accepted by local model adapters. */ LEGACY_TO_HUGGINGFACE: { - // Llama 3.2 family — uses unsloth mirror (no HF approval needed) - 'llama3.2:3b': 'unsloth/Llama-3.2-3B-Instruct', - 'llama3.2:1b': 'Qwen/Qwen2-0.5B-Instruct', // Keep 1B small for gating - 'llama3.2-3b': 'unsloth/Llama-3.2-3B-Instruct', - 'llama3.2-1b': 'Qwen/Qwen2-0.5B-Instruct', - - // Llama 3.1 family - 'llama3.1:8b': 'unsloth/Llama-3.1-8B-Instruct', - 'llama3.1:70b': 'meta-llama/Llama-3.1-70B-Instruct', - - // Phi family (Microsoft, no approval needed) - 'phi3:mini': 'microsoft/Phi-3-mini-4k-instruct', - 'phi3:small': 'microsoft/Phi-3-small-8k-instruct', - 'phi3:medium': 'microsoft/Phi-3-medium-4k-instruct', - 'phi:2': 'microsoft/phi-2', - 'phi3': 'microsoft/Phi-3-mini-4k-instruct', - - // Mistral family (no approval needed) - 'mistral:7b': 'mistralai/Mistral-7B-Instruct-v0.2', - 'mistral:7b-v0.3': 'mistralai/Mistral-7B-Instruct-v0.3', - 'mixtral:8x7b': 'mistralai/Mixtral-8x7B-Instruct-v0.1', - 'mistral': 'mistralai/Mistral-7B-Instruct-v0.2', - - // Qwen family (no 
approval needed - recommended!) + 'qwen3.5': 'continuum-ai/qwen3.5-4b-code-forged-GGUF', + 'qwen3.5:4b': 'continuum-ai/qwen3.5-4b-code-forged-GGUF', + 'qwen3.5-code': 'continuum-ai/qwen3.5-4b-code-forged-GGUF', + 'qwen2-vl': 'qwen2-vl-7b-instruct', 'qwen2:0.5b': 'Qwen/Qwen2-0.5B-Instruct', - 'qwen2:1.5b': 'Qwen/Qwen2-1.5B-Instruct', - 'qwen2:7b': 'Qwen/Qwen2-7B-Instruct', - 'qwen2.5:7b': 'Qwen/Qwen2.5-7B-Instruct', - 'qwen2.5:3b': 'Qwen/Qwen2.5-3B-Instruct', 'qwen2': 'Qwen/Qwen2-0.5B-Instruct', - // Gemma family (Google, no approval needed) - 'gemma:2b': 'google/gemma-2b-it', - 'gemma:7b': 'google/gemma-7b-it', - 'gemma2:2b': 'google/gemma-2-2b-it', - 'gemma2:9b': 'google/gemma-2-9b-it', - - // StarCoder family - 'starcoder2:3b': 'bigcode/starcoder2-3b', - 'starcoder2:7b': 'bigcode/starcoder2-7b', - - // TinyLlama (good for testing) - 'tinyllama': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', - 'tinyllama:1.1b': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', - - // SmolLM2 family (HuggingFace, good for fast testing) - 'smollm2:135m': 'HuggingFaceTB/SmolLM2-135M-Instruct', - 'smollm2:360m': 'HuggingFaceTB/SmolLM2-360M-Instruct', - 'smollm2:1.7b': 'HuggingFaceTB/SmolLM2-1.7B-Instruct', - - // Bare family aliases (resolve to default variant) - 'llama3.2': 'unsloth/Llama-3.2-3B-Instruct', - 'llama3.1': 'unsloth/Llama-3.1-8B-Instruct', 'qwen2.5': 'Qwen/Qwen2.5-7B-Instruct', } as const, @@ -261,7 +212,7 @@ export const LOCAL_MODELS = { return mapping[normalized]; } - // Try without version suffix (e.g., 'llama3.2:3b-instruct' -> 'llama3.2:3b') + // Try without version suffix (e.g., 'qwen3.5:4b-instruct' -> 'qwen3.5:4b') const withoutSuffix = normalized.replace(/-instruct.*$|-chat.*$|-q\d+.*$/i, ''); if (mapping[withoutSuffix]) { return mapping[withoutSuffix]; diff --git a/src/system/shared/ModelCapabilities.ts b/src/system/shared/ModelCapabilities.ts index 917a8a494..5d2eea7a4 100644 --- a/src/system/shared/ModelCapabilities.ts +++ b/src/system/shared/ModelCapabilities.ts @@ -14,8 +14,8 
@@ * Usage: * // At adapter discovery time: * registry.register({ - * modelId: 'meta-llama/Llama-3.1-8B-Instruct', - * provider: 'candle', + * modelId: 'qwen3.5-4b-code-forged', + * provider: 'local', * contextWindow: 1400, * capabilities: { ... }, * adapterProfile: { @@ -27,7 +27,7 @@ * }); * * // At selection time: - * const candidates = registry.getAll('meta-llama/Llama-3.1-8B-Instruct') + * const candidates = registry.getAll('qwen3.5-4b-code-forged') * .filter(m => m.adapterProfile?.fineTuning.supportedMethods.includes(AdapterMethod.QLORA)) * .filter(m => (m.adapterProfile?.hardware.inferenceVramMB ?? Infinity) <= availableVram); */ @@ -274,7 +274,7 @@ export interface FineTuningProfile { * Each runtime has different capabilities for loading models and adapters. */ export enum InferenceRuntime { - /** Candle — Rust-native, GGUF/SafeTensors, Metal acceleration */ + /** Candle — training/auxiliary Rust backend, not default persona chat */ CANDLE = 'candle', /** llama.cpp — C++, GGUF, Metal/CUDA/CPU, mature ecosystem */ diff --git a/src/system/shared/ModelRegistry.ts b/src/system/shared/ModelRegistry.ts index 4d066c518..8a75cf575 100644 --- a/src/system/shared/ModelRegistry.ts +++ b/src/system/shared/ModelRegistry.ts @@ -16,13 +16,13 @@ * * Provider-scoped keys: * Internal map key is `${provider}:${modelId}` to prevent last-writer-wins - * collisions when the same model exists on multiple providers (e.g., - * meta-llama/Llama-3.1-8B-Instruct on Candle at 1400 tokens AND Together at 131072). + * collisions when the same model family exists on multiple providers with + * different context windows. 
* * Usage: * const registry = ModelRegistry.sharedInstance(); * const ctx = registry.contextWindow('claude-sonnet-4-5-20250929'); // any provider - * const ctx = registry.contextWindow('meta-llama/Llama-3.1-8B-Instruct', 'candle'); // specific provider + * const ctx = registry.contextWindow('qwen3.5-4b-code-forged', 'local'); // specific provider * * Future direction — Hardware-Matched Model Selection: * ModelRegistry is designed to evolve into a queryable adapter catalog where @@ -37,7 +37,7 @@ * * 3. Selection query: "give me the best model for this recipe on this hardware" * - Filters by capability, ranks by speed/quality/cost tradeoff - * - Works across local (Candle) and cloud (REST APIs) uniformly + * - Works across local runtime and cloud providers uniformly * * 4. Users with varied hardware (M1 vs RTX 4090 vs cloud-only) get automatically * matched to the best available model without manual configuration. diff --git a/src/system/user/server/PersonaLifecycleManager.ts b/src/system/user/server/PersonaLifecycleManager.ts index 1e4c2e213..16e35f336 100644 --- a/src/system/user/server/PersonaLifecycleManager.ts +++ b/src/system/user/server/PersonaLifecycleManager.ts @@ -195,7 +195,7 @@ export class PersonaLifecycleManager { * providers maintain their own warm state via API connection pooling. 
*/ private isLocalProvider(provider: string): boolean { - return provider === 'local' || provider === 'candle' || provider === 'sentinel'; + return provider === 'local' || provider === 'sentinel'; } /** diff --git a/src/system/user/server/PersonaUser.ts b/src/system/user/server/PersonaUser.ts index d8f8073d9..9eb665c01 100644 --- a/src/system/user/server/PersonaUser.ts +++ b/src/system/user/server/PersonaUser.ts @@ -111,6 +111,7 @@ import { PersonaMessageEvaluator } from './modules/PersonaMessageEvaluator'; import { PersonaMessageGate } from './modules/PersonaMessageGate'; import { PersonaTaskTracker } from './modules/PersonaTaskTracker'; import { PersonaGenomeManager } from './modules/PersonaGenomeManager'; +import { SecretManager } from '../../secrets/SecretManager'; import { type PersonaMediaConfig, DEFAULT_MEDIA_CONFIG } from './modules/PersonaMediaConfig'; import type { CreateSessionParams, CreateSessionResult } from '../../../daemons/session-daemon/shared/SessionTypes'; import { Hippocampus } from './modules/cognitive/memory/Hippocampus'; @@ -123,6 +124,18 @@ import { PrefrontalCortex, type PersonaUserForPrefrontal } from './modules/being import { MotorCortex, type PersonaUserForMotorCortex } from './modules/being/MotorCortex'; import { RustCognitionBridge, type PersonaUserForRustCognition } from './modules/RustCognitionBridge'; import { SystemPaths } from '../../core/config/SystemPaths'; + +const PROVIDER_KEY_ENV: Record = { + anthropic: 'ANTHROPIC_API_KEY', + openai: 'OPENAI_API_KEY', + deepseek: 'DEEPSEEK_API_KEY', + groq: 'GROQ_API_KEY', + xai: 'XAI_API_KEY', + together: 'TOGETHER_API_KEY', + fireworks: 'FIREWORKS_API_KEY', + google: 'GOOGLE_API_KEY', + alibaba: 'DASHSCOPE_API_KEY', +}; import { UnifiedConsciousness } from './modules/consciousness/UnifiedConsciousness'; import { registerConsciousness, unregisterConsciousness } from '../../rag/sources/GlobalAwarenessSource'; import { Workspace } from '../../code/server/Workspace'; @@ -645,12 +658,8 @@ 
export class PersonaUser extends AIUser { this.log.info(`🔧 ${this.displayName}: Initialized inbox, personaState, memory (genome + RAG), trainingAccumulator, toolExecutor, responseGenerator, messageEvaluator, autonomousLoop, and cognition system (workingMemory, selfState, planFormulator)`); // Initialize worker thread for this persona - // Worker uses fast small model for gating decisions (should-respond check). - // 'local' routes through the same adapter registry as chat — DMR when - // available (Metal-fast on Mac, ~50 tok/s), Candle fallback when not. - // Previously hardcoded to 'candle' which forced CPU gating on ALL - // personas even when DMR+Metal was available — the gating bottleneck - // blocked the fast Metal response path. + // Worker is a model-free fallback for should-respond checks. The normal + // gate is Rust fullEvaluate; local chat inference is llama.cpp/Qwen. this.worker = new PersonaWorkerThread(this.id, { providerType: 'local', providerConfig: { @@ -805,7 +814,7 @@ export class PersonaUser extends AIUser { const adapters = this.memory!.genome.getAllAdapters().map(a => ({ name: a.getName(), domain: a.getDomain(), - ollama_model_name: a.getTrainedModelName() ?? undefined, + trained_model_name: a.getTrainedModelName() ?? undefined, is_loaded: a.isLoaded(), is_current: a === this.memory!.genome.getCurrentAdapter(), priority: a.getPriority(), @@ -1147,12 +1156,13 @@ export class PersonaUser extends AIUser { // Daemon is ready, wire the genome try { - // Try to get CandleAdapter (native Rust inference with LoRA support) + // Training/LoRA composition still uses the Candle adapter. Runtime chat + // inference does not. const candleAdapter = AIProviderDaemon.getAdapter('candle'); - this.logger.enqueueLog('cognition.log', `🧬 wireGenomeToProvider — candleAdapter=${candleAdapter ? 'found' : 'null'}, provider=${this.modelConfig.provider}`); + this.logger.enqueueLog('cognition.log', `🧬 wireGenomeToProvider — trainingAdapter=${candleAdapter ? 
'found' : 'null'}, provider=${this.modelConfig.provider}`); if (candleAdapter) { this.memory.genome.setAIProvider(candleAdapter); - this.logger.enqueueLog('cognition.log', `🧬 Genome wired to CandleAdapter (LoRA composition enabled)`); + this.logger.enqueueLog('cognition.log', `🧬 Genome wired to training adapter (LoRA composition enabled)`); } else { this.log.warn(`⚠️ ${this.displayName}: No Candle adapter available for genome`); } @@ -1389,6 +1399,11 @@ export class PersonaUser extends AIUser { return; } + if (!this.isProviderAvailableForChat()) { + this.log.debug(`⏭️ ${this.displayName}: Skipping chat (provider ${this.modelConfig.provider} is not configured)`); + return; + } + // STEP 2: Deduplication - prevent evaluating same message multiple times // Uses TS-local Set (not Rust DashSet) because CognitionEngine.evaluated_messages // serves a different purpose (fast_path_decision pipeline dedup). Merging them @@ -1693,6 +1708,11 @@ export class PersonaUser extends AIUser { preBuiltRagContext?: PipelineRAGContext, socialSignals?: import('../../../shared/generated').SocialSignals ): Promise { + if (!this.isProviderAvailableForChat()) { + this.log.warn(`⏭️ ${this.displayName}: Refusing response generation because provider ${this.modelConfig.provider} is not configured`); + return; + } + // Check dormancy state before responding const shouldRespond = this.responseGenerator.shouldRespondToMessage( originalMessage, @@ -1712,6 +1732,21 @@ export class PersonaUser extends AIUser { } } + private isProviderAvailableForChat(): boolean { + const provider = this.modelConfig.provider; + if (provider === 'local' || provider === 'sentinel') { + return true; + } + + const keyEnv = PROVIDER_KEY_ENV[provider]; + if (!keyEnv) { + return true; + } + + const secretValue = SecretManager.getInstance().get(keyEnv, 'PersonaUser'); + return Boolean(secretValue); + } + /** * Generate text using this persona's LLM * diff --git a/src/system/user/server/modules/PersonaGenome.ts 
b/src/system/user/server/modules/PersonaGenome.ts index 53227c649..b10a9d5ed 100644 --- a/src/system/user/server/modules/PersonaGenome.ts +++ b/src/system/user/server/modules/PersonaGenome.ts @@ -536,7 +536,8 @@ export class PersonaGenome { * Get active adapters in format suitable for TextGenerationRequest * * This is the bridge between PersonaGenome and the AI provider system. - * Returns adapter info that CandleAdapter can use to load/apply LoRA weights. + * Returns adapter info that the active training/runtime adapter can use to + * load or apply LoRA weights. */ getActiveAdaptersForRequest(): Array<{ name: string; path: string; domain: string; scale: number }> { const result: Array<{ name: string; path: string; domain: string; scale: number }> = []; diff --git a/src/system/user/server/modules/PersonaTaskExecutor.ts b/src/system/user/server/modules/PersonaTaskExecutor.ts index 90e6611b8..b2e2ac000 100644 --- a/src/system/user/server/modules/PersonaTaskExecutor.ts +++ b/src/system/user/server/modules/PersonaTaskExecutor.ts @@ -586,7 +586,7 @@ export class PersonaTaskExecutor { this.log(`🧬 ${this.displayName}: Collected ${trainingData.examples.length} training examples`); // 3. Build training request - const baseModel = this.memory.genome.getState().baseModel || 'llama3.2:3b'; + const baseModel = this.memory.genome.getState().baseModel || 'continuum-ai/qwen3.5-4b-code-forged-GGUF'; const trainingRequest: LoRATrainingRequest = { personaId: this.personaId, personaName: this.displayName, diff --git a/src/system/user/server/modules/ProgressiveScorer.ts b/src/system/user/server/modules/ProgressiveScorer.ts index 2c03fcf66..750a0685b 100644 --- a/src/system/user/server/modules/ProgressiveScorer.ts +++ b/src/system/user/server/modules/ProgressiveScorer.ts @@ -12,8 +12,9 @@ * **Purpose**: Enable mid-stream model upgrades when lower-tier models show signs * of struggling, maintaining cost-efficiency while preserving quality. 
* - * **Core Concept**: Start cheap/free (qwen2.5:7b), detect complexity as generating, - * upgrade only when needed (llama3.1:70b → deepseek-chat → claude-3-5-sonnet). + * **Core Concept**: Start with the cheapest local-capable model selected by + * the Rust registry/admission layer, detect complexity as generating, and + * upgrade only when a richer local/cloud capability is explicitly available. * * **Integration**: Used by AIProviderDaemon streaming wrapper (Phase 2B) * diff --git a/src/system/user/server/modules/cognition/PeerReviewTypes.ts b/src/system/user/server/modules/cognition/PeerReviewTypes.ts index d11e14999..f92f308ea 100644 --- a/src/system/user/server/modules/cognition/PeerReviewTypes.ts +++ b/src/system/user/server/modules/cognition/PeerReviewTypes.ts @@ -324,9 +324,9 @@ export const MODEL_INTELLIGENCE_WEIGHTS: Record = { 'xai:grok-4': 0.85, 'xai:grok-3': 0.8, // Updated from grok-beta (deprecated 2025-09-15) - // Candle (local models) - 'candle:llama3.2:3b': 0.3, - 'candle:llama3.1:8b': 0.5, + // Local models + 'local:continuum-ai/qwen3.5-4b-code-forged-GGUF': 0.55, + 'local:Qwen/Qwen2-0.5B-Instruct': 0.2, // Sentinel (local pre-trained) 'sentinel:gpt2': 0.2, diff --git a/src/system/user/server/modules/cognition/adapters/LLMAdapter.ts b/src/system/user/server/modules/cognition/adapters/LLMAdapter.ts index 69a1bb836..984c7b9a1 100644 --- a/src/system/user/server/modules/cognition/adapters/LLMAdapter.ts +++ b/src/system/user/server/modules/cognition/adapters/LLMAdapter.ts @@ -72,12 +72,12 @@ export class LLMAdapter implements IDecisionAdapter { // Map gating model mode to actual model name // 'deterministic' = skip LLM, use simple heuristics - // 'small' = fast model (llama3.2:1b) - // 'full' = accurate model (llama3.2:3b) + // 'small' = fast local gating model + // 'full' = active persona model const gatingModelMap: Record = { 'deterministic': null, // Skip LLM gating - 'small': 'llama3.2:1b', // Fast (~150-200ms) - 'full': 'llama3.2:3b' // 
Accurate (~400-500ms) + 'small': 'Qwen/Qwen2-0.5B-Instruct', + 'full': context.modelId ?? 'continuum-ai/qwen3.5-4b-code-forged-GGUF' }; // Default to 'deterministic' to avoid queue contention with main generation diff --git a/src/system/user/server/tests/integration/PersonaUser-Lifecycle.test.ts b/src/system/user/server/tests/integration/PersonaUser-Lifecycle.test.ts index 5219cd1ba..8158e2b68 100644 --- a/src/system/user/server/tests/integration/PersonaUser-Lifecycle.test.ts +++ b/src/system/user/server/tests/integration/PersonaUser-Lifecycle.test.ts @@ -30,8 +30,8 @@ describe('PersonaUser Lifecycle (Baseline)', () => { displayName: 'Test Persona (Baseline)', type: 'persona', modelConfig: { - provider: 'candle', - model: 'llama3.2', + provider: 'local', + model: 'continuum-ai/qwen3.5-4b-code-forged-GGUF', capabilities: ['text'] }, capabilities: ['text'], diff --git a/src/workers/continuum-core/config/models.toml b/src/workers/continuum-core/config/models.toml index 072bf0b25..8b4789684 100644 --- a/src/workers/continuum-core/config/models.toml +++ b/src/workers/continuum-core/config/models.toml @@ -236,12 +236,6 @@ capabilities = ["text-generation", "chat", "tool-use", "streaming"] cost_input_per_1k = 0.0 cost_output_per_1k = 0.0 gguf_hint = "huggingface.co/continuum-ai/qwen3.5-4b-code-forged-gguf" -# Where the in-process Metal/CUDA path loads the GGUF from. This is the -# artifact DMR caches under its content-addressed bundle store — same -# bytes the `docker model run` path serves. The SHA is stable (it's the -# published artifact hash), so pinning it here is correct; a newer -# forge would publish a new id, not mutate this one. -gguf_local_path = "~/.docker/models/bundles/sha256/0ed44d4643b05eba23a4ec765aeee8c0f818f9063b09e54d30ded513287f18e9/model/model.gguf" # Explicit qwen3.5 chatml template. 
The forged GGUF doesn't embed # `tokenizer.chat_template` in its metadata, and llama.cpp's built-in # chatml default drifts from qwen3.5's training on boundary tokens diff --git a/src/workers/continuum-core/config/providers.toml b/src/workers/continuum-core/config/providers.toml index 0c1106d53..baa631081 100644 --- a/src/workers/continuum-core/config/providers.toml +++ b/src/workers/continuum-core/config/providers.toml @@ -89,7 +89,7 @@ name = "Docker Model Runner (local Metal/CUDA)" # silently killing persona chat. Pinning to 127.0.0.1 bypasses the dual- # stack resolution entirely. base_url = "http://127.0.0.1:12434/engines/llama.cpp" -default_model = "docker.io/ai/qwen2.5:7B-Q4_K_M" +default_model = "huggingface.co/continuum-ai/qwen3.5-4b-code-forged-gguf:latest" auth = "none" # Dynamic catalog — provider lists models via /v1/models at init. # No model_prefixes — supports_model consults the live catalog, not static prefixes. diff --git a/src/workers/continuum-core/src/ai/adapter.rs b/src/workers/continuum-core/src/ai/adapter.rs index 2413801af..c34c17ec7 100644 --- a/src/workers/continuum-core/src/ai/adapter.rs +++ b/src/workers/continuum-core/src/ai/adapter.rs @@ -305,7 +305,7 @@ impl AdapterRegistry { /// Register an adapter with a priority (lower = higher priority) pub fn register(&mut self, adapter: Box, priority: usize) { - let id = adapter.provider_id().to_string(); + let id = self.registration_key(adapter.provider_id()); // Insert into priority order if priority >= self.priority_order.len() { @@ -317,6 +317,20 @@ impl AdapterRegistry { self.adapters.insert(id, adapter); } + fn registration_key(&self, provider_id: &str) -> String { + if !self.adapters.contains_key(provider_id) { + return provider_id.to_string(); + } + let mut i = 2; + loop { + let candidate = format!("{provider_id}#{i}"); + if !self.adapters.contains_key(&candidate) { + return candidate; + } + i += 1; + } + } + /// Drop an adapter from the registry. Mirror of `register`. 
The /// hot-swap lever for adapters whose health is dynamic (e.g. DMR /// when Docker Desktop crashes — see `DmrWatchdog`). Returns true @@ -327,9 +341,23 @@ impl AdapterRegistry { /// if there's per-adapter cleanup to do; this method drops the /// boxed adapter (Drop impl runs). pub fn deregister(&mut self, provider_id: &str) -> bool { - let removed = self.adapters.remove(provider_id).is_some(); + let keys: Vec = self + .adapters + .iter() + .filter_map(|(key, adapter)| { + if key == provider_id || adapter.provider_id() == provider_id { + Some(key.clone()) + } else { + None + } + }) + .collect(); + let removed = !keys.is_empty(); if removed { - self.priority_order.retain(|id| id != provider_id); + for key in &keys { + self.adapters.remove(key); + } + self.priority_order.retain(|id| !keys.contains(id)); } removed } @@ -338,17 +366,38 @@ impl AdapterRegistry { /// HashMap lookup. Used by health-watchdogs to decide whether they /// need to register or deregister on a probe state change. pub fn is_registered(&self, provider_id: &str) -> bool { - self.adapters.contains_key(provider_id) + self.adapters + .iter() + .any(|(key, adapter)| key == provider_id || adapter.provider_id() == provider_id) } /// Get adapter by provider ID pub fn get(&self, provider_id: &str) -> Option<&dyn AIProviderAdapter> { - self.adapters.get(provider_id).map(|b| b.as_ref()) + self.adapters + .get(provider_id) + .map(|b| b.as_ref()) + .or_else(|| { + self.priority_order.iter().find_map(|key| { + self.adapters + .get(key) + .filter(|adapter| adapter.provider_id() == provider_id) + .map(|b| b.as_ref()) + }) + }) } /// Get mutable adapter by provider ID pub fn get_mut(&mut self, provider_id: &str) -> Option<&mut Box> { - self.adapters.get_mut(provider_id) + if self.adapters.contains_key(provider_id) { + return self.adapters.get_mut(provider_id); + } + let key = self.priority_order.iter().find_map(|key| { + self.adapters + .get(key) + .filter(|adapter| adapter.provider_id() == provider_id) + 
.map(|_| key.clone()) + })?; + self.adapters.get_mut(&key) } /// Get available adapters (those that initialized successfully) @@ -386,9 +435,13 @@ impl AdapterRegistry { // hard-error when neither can serve the model. if let Some(pref) = preferred_provider { if pref != "local" { - for (id, adapter) in self.adapters.iter() { - if id == pref { - return Some((id.as_str(), adapter.as_ref())); + for key in &self.priority_order { + if let Some(adapter) = self.adapters.get(key) { + if key == pref || adapter.provider_id() == pref { + if model.map_or(true, |m| adapter.supports_model(m)) { + return Some((adapter.provider_id(), adapter.as_ref())); + } + } } } clog_warn!( @@ -423,8 +476,8 @@ impl AdapterRegistry { None }; if let Some(provider_id) = cloud_match { - if let Some(adapter) = self.adapters.get(provider_id) { - return Some((provider_id, adapter.as_ref())); + if let Some(adapter) = self.get(provider_id) { + return Some((provider_id, adapter)); } } } @@ -449,7 +502,7 @@ impl AdapterRegistry { // If model specified, adapter must honestly support it. // If no model specified, any adapter on the right device works. if model.map_or(true, |m| adapter.supports_model(m)) { - return Some((id.as_str(), adapter.as_ref())); + return Some((adapter.provider_id(), adapter.as_ref())); } } } @@ -519,6 +572,7 @@ mod tests { /// inference — every operation either no-ops or returns a stub. 
struct StubAdapter { id: String, + model: Option, } #[async_trait] @@ -567,12 +621,24 @@ mod tests { InferenceDevice::Gpu } fn supports_model(&self, _model: &str) -> bool { - true + self.model + .as_deref() + .map_or(true, |model| model == _model) } } fn stub(id: &str) -> Box { - Box::new(StubAdapter { id: id.to_string() }) + Box::new(StubAdapter { + id: id.to_string(), + model: None, + }) + } + + fn stub_model(id: &str, model: &str) -> Box { + Box::new(StubAdapter { + id: id.to_string(), + model: Some(model.to_string()), + }) } #[test] @@ -618,4 +684,27 @@ mod tests { // Final cycle leaves it unregistered. assert_eq!(r.available().len(), 0); } + + #[test] + fn duplicate_provider_ids_remain_independently_selectable_by_model() { + let mut r = AdapterRegistry::new(); + r.register(stub_model("llamacpp-local", "qwen3.5"), 0); + r.register(stub_model("llamacpp-local", "qwen2-vl"), 0); + + assert_eq!(r.available().len(), 2); + assert!(r.is_registered("llamacpp-local")); + + let (_, qwen35) = r + .select(Some("local"), Some("qwen3.5"), InferenceDevice::Gpu) + .expect("qwen3.5 adapter selected"); + assert_eq!(qwen35.default_model(), "stub"); + assert!(qwen35.supports_model("qwen3.5")); + assert!(!qwen35.supports_model("qwen2-vl")); + + let (_, qwen2) = r + .select(Some("local"), Some("qwen2-vl"), InferenceDevice::Gpu) + .expect("qwen2-vl adapter selected"); + assert!(qwen2.supports_model("qwen2-vl")); + assert!(!qwen2.supports_model("qwen3.5")); + } } diff --git a/src/workers/continuum-core/src/inference/candle_adapter.rs b/src/workers/continuum-core/src/inference/candle_adapter.rs index f95f9ec04..01ed0e934 100644 --- a/src/workers/continuum-core/src/inference/candle_adapter.rs +++ b/src/workers/continuum-core/src/inference/candle_adapter.rs @@ -1,6 +1,8 @@ //! Candle Adapter - Local LLM Inference via AIProviderAdapter //! -//! Implements the AIProviderAdapter trait for local Candle inference. +//! 
Implements the AIProviderAdapter trait for explicit Candle training and +//! auxiliary inference paths. Runtime persona chat uses provider `local`, which +//! resolves through the Qwen/llama.cpp runtime instead of this adapter. //! Uses `ModelBackend` trait — no format-specific code paths. //! One backend, one generate function, works for GGUF and safetensors. //! @@ -20,6 +22,9 @@ use crate::ai::{ }; use crate::gpu::make_entry; use crate::gpu::memory_manager::{GpuAllocationGuard, GpuMemoryManager, GpuPriority, GpuSubsystem}; +use crate::model_registry::{ + find_first_local_gguf, resolve_gguf_for_model_id, resolve_local_model_dir_for_model_id, +}; use crate::runtime; use crate::system_resources::local_inference_capacity; @@ -38,7 +43,7 @@ struct BackendWrapper(Box); unsafe impl Send for BackendWrapper {} unsafe impl Sync for BackendWrapper {} -/// Candle adapter for local LLM inference. +/// Candle adapter for training/auxiliary LLM work. /// /// Holds a single `ModelBackend` — no ModelVariant enum, no format switches. /// The backend reports its own capabilities (context_length, architecture, etc.) @@ -84,7 +89,7 @@ impl CandleAdapter { name: "Candle Local".to_string(), base_url: String::new(), api_key_env: String::new(), - default_model: "unsloth/Llama-3.2-3B-Instruct".to_string(), + default_model: "continuum-ai/qwen3.5-4b-code-forged-GGUF".to_string(), timeout_ms: 300_000, max_retries: 1, retry_delay_ms: 0, @@ -425,7 +430,7 @@ fn inference_inner( log.info(&format!("Loading model: {}", resolved_model)); let model: Box = if use_quantized { load_default_quantized().map_err(|e| format!("Failed to load quantized model: {e}"))? 
- } else if let Some(local_dir) = find_local_model(resolved_model) { + } else if let Some(local_dir) = resolve_local_model_dir_for_model_id(resolved_model) { // Local GGUF model found — load from disk (no download needed) log.info(&format!("Found local model: {:?}", local_dir)); super::model::load_model_from_dir(&local_dir, resolved_model) @@ -1057,9 +1062,7 @@ const REGISTRY_JSON: &str = include_str!("../../../../shared/models.json"); fn load_full_registry() -> FullRegistry { serde_json::from_str(REGISTRY_JSON).unwrap_or_else(|e| { - runtime::logger("candle").error(&format!( - "Failed to parse src/shared/models.json: {e}" - )); + runtime::logger("candle").error(&format!("Failed to parse src/shared/models.json: {e}")); FullRegistry { models: HashMap::new(), tiers: HashMap::new(), @@ -1156,7 +1159,7 @@ pub fn resolve_model_id(requested: &str) -> String { return entry.hf_repo.clone(); } - // 3. Common alias pattern: 'smollm2-1.7b' → 'smollm2:1.7b'. + // 3. Common alias pattern: 'qwen2-0.5b' → 'qwen2:0.5b'. let dash_to_colon = normalized.replacen('-', ":", 1); if let Some(entry) = reg.models.get(&dash_to_colon) { return entry.hf_repo.clone(); @@ -1171,70 +1174,6 @@ pub fn resolve_model_id(requested: &str) -> String { requested.to_string() } -/// Resolve the storage root for large files (models, adapters, datasets). -/// Checks CONTINUUM_STORAGE_PATH from: env var → ~/.continuum/config.env → fallback ~/.continuum/. -fn storage_root() -> std::path::PathBuf { - // 1. Check env var first - if let Ok(storage) = std::env::var("CONTINUUM_STORAGE_PATH") { - if !storage.is_empty() { - return std::path::PathBuf::from(storage); - } - } - // 2. 
Check config.env (Secrets module skips non-secret keys like this) - if let Some(home) = dirs::home_dir() { - let config_path = home.join(".continuum").join("config.env"); - if let Ok(content) = std::fs::read_to_string(&config_path) { - for line in content.lines() { - let trimmed = line.trim(); - if let Some(value) = trimmed.strip_prefix("CONTINUUM_STORAGE_PATH=") { - let value = value.trim(); - if !value.is_empty() { - return std::path::PathBuf::from(value); - } - } - } - } - } - // 3. Default - let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".into()); - std::path::PathBuf::from(home).join(".continuum") -} - -/// Find the first available GGUF on disk for eager-load warmup. Scans the -/// HF cache (`~/.cache/huggingface/hub/models--*-GGUF/snapshots/*/*.gguf`) -/// and returns the first match. Used by `initialize()` to pick a sensible -/// default model when no specific request has come in yet. -fn find_first_local_gguf() -> Option { - let home = std::env::var("HOME").ok()?; - let hf_cache = std::path::PathBuf::from(&home).join(".cache/huggingface/hub"); - if !hf_cache.exists() { - return None; - } - for entry in std::fs::read_dir(&hf_cache).ok()?.flatten() { - let name = entry.file_name(); - let name_str = name.to_string_lossy(); - if !name_str.starts_with("models--") { - continue; - } - let snapshots = entry.path().join("snapshots"); - let Ok(snaps) = std::fs::read_dir(&snapshots) else { - continue; - }; - for snap in snaps.flatten() { - let Ok(files) = std::fs::read_dir(snap.path()) else { - continue; - }; - for f in files.flatten() { - let p = f.path(); - if p.extension().and_then(|s| s.to_str()) == Some("gguf") { - return Some(p); - } - } - } - } - None -} - /// Ensure the llama.cpp backend is loaded for `model_id`. Idempotent and /// safe for concurrent callers via `load_gate`. 
The actual `Model::load` /// runs in `spawn_blocking` because it is a synchronous C++ FFI call @@ -1258,7 +1197,7 @@ async fn ensure_llamacpp_loaded_async( return Ok(()); } let log = runtime::logger("candle"); - let gguf_path = find_local_gguf(model_id) + let gguf_path = resolve_gguf_for_model_id(model_id) .ok_or_else(|| format!( "No GGUF for model '{}'. Ensure the model is downloaded to ~/.continuum/genome/models or HF cache.", model_id @@ -1284,153 +1223,6 @@ async fn ensure_llamacpp_loaded_async( Ok(()) } -/// Check if a model is available locally as a GGUF. -/// Searches ~/.continuum/ (internal NVMe, fast) FIRST, then CONTINUUM_STORAGE_PATH (external, slow). -/// Returns the local directory path if found, None if not cached. -/// Find the .gguf file for a model, searching local dirs + HF cache. -/// Used by the llama.cpp backend which needs a GGUF file path directly. -fn find_local_gguf(model_id: &str) -> Option { - // Try local model dir first (via find_local_model) - if let Some(dir) = find_local_model(model_id) { - if let Ok(entries) = std::fs::read_dir(&dir) { - for entry in entries.flatten() { - let p = entry.path(); - if p.extension().and_then(|s| s.to_str()) == Some("gguf") { - return Some(p); - } - } - } - } - // Fall back to HF cache - let home = std::env::var("HOME").ok()?; - let hf_cache = std::path::PathBuf::from(&home).join(".cache/huggingface/hub"); - if !hf_cache.exists() { - return None; - } - for entry in std::fs::read_dir(&hf_cache).ok()?.flatten() { - let name = entry.file_name(); - let name_str = name.to_string_lossy(); - // Match "models--**" or a fuzzy match on slug - if name_str.starts_with("models--") - && name_str - .to_lowercase() - .contains(&model_id.to_lowercase().replace('/', "--")) - { - // Look inside snapshots// for a .gguf file - let snapshots = entry.path().join("snapshots"); - if let Ok(snaps) = std::fs::read_dir(&snapshots) { - for snap in snaps.flatten() { - if let Ok(files) = std::fs::read_dir(snap.path()) { - for f in 
files.flatten() { - let p = f.path(); - if p.extension().and_then(|s| s.to_str()) == Some("gguf") { - return Some(p); - } - } - } - } - } - } - } - None -} - -fn find_local_model(model_id: &str) -> Option { - let search_dirs = { - let mut dirs = Vec::new(); - // Internal drive first (NVMe = ~2s load vs external USB = ~105s) - let home = std::env::var("HOME").ok()?; - let home_models = std::path::PathBuf::from(&home).join(".continuum/genome/models"); - dirs.push(home_models.clone()); - // External/overflow storage second - let storage_models = storage_root().join("genome/models"); - if storage_models != home_models { - dirs.push(storage_models); - } - dirs - }; - - for models_dir in &search_dirs { - if !models_dir.exists() { - continue; - } - if let Some(found) = find_model_in_dir(model_id, models_dir) { - return Some(found); - } - } - None -} - -fn find_model_in_dir(model_id: &str, models_dir: &std::path::Path) -> Option { - if !models_dir.exists() { - return None; - } - - // Check for exact directory match (e.g., model dirs we created) - for entry in std::fs::read_dir(&models_dir).ok()? 
{ - let entry = entry.ok()?; - let path = entry.path(); - if !path.is_dir() { - continue; - } - - // Check if this directory has a GGUF file + tokenizer - let has_gguf = std::fs::read_dir(&path) - .ok() - .map(|entries| { - entries.filter_map(|e| e.ok()).any(|e| { - e.path() - .extension() - .and_then(|ext| ext.to_str()) - .map(|ext| ext == "gguf") - .unwrap_or(false) - }) - }) - .unwrap_or(false); - - let has_tokenizer = path.join("tokenizer.json").exists(); - - if has_gguf && has_tokenizer { - // Match by directory name containing model ID parts - let dir_name = path.file_name()?.to_str()?.to_lowercase(); - let model_lower = model_id.to_lowercase(); - - // Match "continuum-ai/qwen2.5-coder-32b-compacted" against "qwen32b-compacted-v3" - // Must also match size indicator (14b, 32b) to avoid confusing 14B and 32B models - if model_lower.contains("qwen") - && model_lower.contains("compacted") - && dir_name.contains("qwen") - && dir_name.contains("compacted") - { - // Extract size indicator from model_id (e.g., "14b", "32b") - let size_match = ["14b", "32b", "7b", "3b", "1b"] - .iter() - .find(|s| model_lower.contains(*s)); - if let Some(size) = size_match { - // If model specifies a size, directory must also contain it - if dir_name.contains(size) { - return Some(path); - } - // Size mismatch — skip this directory - } else { - // No size in model_id — accept any match - return Some(path); - } - } - - // Generic: check if model_id's repo name appears in dir name - if let Some(repo_name) = model_id.split('/').last() { - let repo_lower = repo_name.to_lowercase().replace('.', ""); - if dir_name.contains(&repo_lower) { - return Some(path); - } - } - } - } - - None -} - /// Estimate VRAM usage for a LoRA adapter from its file path. /// Path may be a directory (containing adapter_model.safetensors) or a direct file. 
fn estimate_adapter_vram(path: &str) -> u64 { @@ -1460,11 +1252,11 @@ pub fn resolve_chat_template(requested_model: &str) -> String { if normalized.contains("qwen") { return "qwen2".to_string(); } - if normalized.contains("chatml") || normalized.contains("smollm") { + if normalized.contains("chatml") { return "chatml".to_string(); } - "llama3".to_string() + "qwen2".to_string() } /// Extract text content from a chat message. @@ -1653,8 +1445,8 @@ mod tests { assert_eq!(resolve_chat_template("qwen2-vl-7b"), "qwen2"); // Heuristic fallback: name-based inference for unknown models. assert_eq!(resolve_chat_template("some-qwen-thing"), "qwen2"); - assert_eq!(resolve_chat_template("smollm2-future"), "chatml"); - assert_eq!(resolve_chat_template("unknown-model"), "llama3"); // default fallback + assert_eq!(resolve_chat_template("chatml-future"), "chatml"); + assert_eq!(resolve_chat_template("unknown-model"), "qwen2"); // local default fallback } #[test] @@ -1664,8 +1456,14 @@ mod tests { // succeeds (non-passthrough) for tier-bound refs and that // model-bound refs always resolve to the same concrete model. 
let local = resolve_model_id("local-default"); - assert_ne!(local, "local-default", "local-default must resolve to a concrete repo"); - assert!(local.contains('/'), "resolved model must look like an HF repo: got {local}"); + assert_ne!( + local, "local-default", + "local-default must resolve to a concrete repo" + ); + assert!( + local.contains('/'), + "resolved model must look like an HF repo: got {local}" + ); let vision = resolve_model_id("vision-default"); assert_eq!(vision, "Qwen/Qwen2-VL-7B-Instruct-GGUF"); diff --git a/src/workers/continuum-core/src/inference/llamacpp_adapter.rs b/src/workers/continuum-core/src/inference/llamacpp_adapter.rs index 71eab80f6..ec55dcd11 100644 --- a/src/workers/continuum-core/src/inference/llamacpp_adapter.rs +++ b/src/workers/continuum-core/src/inference/llamacpp_adapter.rs @@ -153,7 +153,7 @@ pub struct LlamaCppAdapter { impl LlamaCppAdapter { /// Construct from the model_registry. Looks up the first model under - /// provider `llamacpp-local` that has a non-None `gguf_local_path` + /// provider `llamacpp-local` whose GGUF artifact resolved locally /// and uses its id + path. If the registry has no such row, panics /// — that's a config bug, not a runtime failure mode (per the /// no-fallback rule). 
@@ -271,8 +271,8 @@ impl LlamaCppAdapter { if !self.model_path.exists() { return Err(format!( "model GGUF not found at {:?} for model `{}` — \ - either pull the artifact to that path (it's the \ - `gguf_local_path` declared in config/models.toml) or \ + either pull the artifact identified by the registry \ + `gguf_hint` or \ override via with_model_path()", self.model_path, self.default_model, )); @@ -804,9 +804,6 @@ impl AIProviderAdapter for LlamaCppAdapter { } fn supports_model(&self, model_name: &str) -> bool { - let want = model_name.to_lowercase(); - models_for_provider_via_registry(LLAMACPP_PROVIDER_ID) - .iter() - .any(|m| m.id.to_lowercase() == want) + self.default_model.eq_ignore_ascii_case(model_name) } } diff --git a/src/workers/continuum-core/src/inference/model.rs b/src/workers/continuum-core/src/inference/model.rs index 6acf4cebf..f5e2feac3 100644 --- a/src/workers/continuum-core/src/inference/model.rs +++ b/src/workers/continuum-core/src/inference/model.rs @@ -1,12 +1,13 @@ //! Model Loading Utilities //! -//! Handles downloading models from HuggingFace Hub, loading them into -//! Candle, and LoRA weight merging. Model state lives in +//! Handles downloading curated training/auxiliary models from HuggingFace Hub, +//! loading them into Candle when explicitly requested, and LoRA weight merging. +//! Runtime persona chat uses the local Qwen/llama.cpp path. Model state lives in //! `backends::LlamaSafetensorsBackend` — this module provides the loading //! and utility functions. //! //! Supports: -//! - Llama architecture models (safetensors format) +//! - Qwen/Llama-family safetensors models for training/auxiliary use //! - BF16/FP32 precision //! - GPU acceleration (Metal/CUDA) //! 
- LoRA weight merging (single and multi-adapter) @@ -506,7 +507,7 @@ fn load_safetensors_from_config( pub fn load_default_model( ) -> Result, Box> { let model_id = std::env::var("INFERENCE_MODEL_ID") - .unwrap_or_else(|_| "unsloth/Llama-3.2-3B-Instruct".to_string()); + .unwrap_or_else(|_| "continuum-ai/qwen3.5-4b-code-forged-GGUF".to_string()); load_model_by_id(&model_id) } diff --git a/src/workers/continuum-core/src/inference/quantized.rs b/src/workers/continuum-core/src/inference/quantized.rs index 709f6d8a0..6075b75d8 100644 --- a/src/workers/continuum-core/src/inference/quantized.rs +++ b/src/workers/continuum-core/src/inference/quantized.rs @@ -114,8 +114,8 @@ pub fn load_quantized_model( let tokenizer_sources = vec![ tokenizer_repo.to_string(), - "unsloth/Llama-3.2-3B-Instruct".to_string(), - "unsloth/Meta-Llama-3.1-8B-Instruct".to_string(), + "continuum-ai/qwen3.5-4b-code-forged-GGUF".to_string(), + "Qwen/Qwen2-VL-7B-Instruct-GGUF".to_string(), ]; let mut tokenizer: Option = None; diff --git a/src/workers/continuum-core/src/model_registry/artifacts.rs b/src/workers/continuum-core/src/model_registry/artifacts.rs new file mode 100644 index 000000000..fdc629adf --- /dev/null +++ b/src/workers/continuum-core/src/model_registry/artifacts.rs @@ -0,0 +1,412 @@ +//! Local model artifact resolution. +//! +//! The registry owns model identity and artifact hints; this module owns +//! filesystem discovery for those artifacts. Adapters must consume resolved +//! paths from here instead of guessing cache layouts privately. 
+ +use super::types::Model; +use std::fs; +use std::path::{Path, PathBuf}; + +pub fn resolve_model_artifacts(model: &mut Model) { + model.gguf_local_path = resolve_gguf_for_model(model); + if let Some(p) = model.mmproj_local_path.take() { + model.mmproj_local_path = Some(expand_user_path(&p)); + } +} + +pub fn resolve_gguf_for_model(model: &Model) -> Option { + resolve_gguf( + &model.id, + model.gguf_hint.as_deref(), + model.gguf_local_path.as_deref(), + ) +} + +pub fn resolve_gguf_for_model_id(model_id: &str) -> Option { + if let Some(registry) = crate::model_registry::try_global() { + if let Some(model) = registry.model(model_id) { + return resolve_gguf_for_model(model); + } + } + resolve_gguf(model_id, None, None) +} + +pub fn resolve_local_model_dir_for_model_id(model_id: &str) -> Option { + resolve_from_local_model_roots(model_id).and_then(|gguf| gguf.parent().map(Path::to_path_buf)) +} + +pub fn find_first_local_gguf() -> Option { + let mut candidates = Vec::new(); + for dir in local_model_roots() { + collect_ggufs_recursive(&dir, &mut candidates); + } + if let Some(cache) = huggingface_cache_root() { + collect_ggufs_recursive(&cache, &mut candidates); + } + pick_best_candidate(candidates) +} + +pub fn expand_user_path(p: &Path) -> PathBuf { + let s = p.to_string_lossy(); + let home = home_dir_string(); + if let Some(home) = home { + if let Some(rest) = s.strip_prefix("~/") { + return PathBuf::from(format!("{home}/{rest}")); + } + if s == "~" { + return PathBuf::from(home); + } + if let Some(rest) = s.strip_prefix("$HOME/") { + return PathBuf::from(format!("{home}/{rest}")); + } + if let Some(rest) = s.strip_prefix("%USERPROFILE%/") { + return PathBuf::from(format!("{home}/{rest}")); + } + if let Some(rest) = s.strip_prefix("%USERPROFILE%\\") { + return PathBuf::from(format!("{home}\\{rest}")); + } + } + p.to_path_buf() +} + +fn resolve_gguf(model_id: &str, hint: Option<&str>, explicit: Option<&Path>) -> Option { + if let Some(path) = explicit { + let 
expanded = expand_user_path(path); + if expanded.exists() { + return Some(expanded); + } + } + + if let Some(path) = resolve_from_local_model_roots(model_id) { + return Some(path); + } + + if let Some(hint) = hint { + if let Some(path) = resolve_from_huggingface_hint(hint) { + return Some(path); + } + } + + resolve_from_huggingface_model_id(model_id) +} + +fn resolve_from_local_model_roots(model_id: &str) -> Option { + for root in local_model_roots() { + if let Some(dir) = find_model_dir_in_root(model_id, &root) { + if let Some(gguf) = first_gguf_in_dir(&dir) { + return Some(gguf); + } + } + } + None +} + +fn local_model_roots() -> Vec { + let mut roots = Vec::new(); + if let Some(home) = home_dir_string() { + roots.push( + PathBuf::from(&home) + .join(".continuum") + .join("genome") + .join("models"), + ); + } + let storage_models = storage_root().join("genome").join("models"); + if !roots.iter().any(|p| p == &storage_models) { + roots.push(storage_models); + } + roots +} + +fn storage_root() -> PathBuf { + if let Ok(storage) = std::env::var("CONTINUUM_STORAGE_PATH") { + if !storage.trim().is_empty() { + return PathBuf::from(storage); + } + } + if let Some(home) = home_dir_string() { + let config_path = PathBuf::from(&home).join(".continuum").join("config.env"); + if let Ok(content) = fs::read_to_string(config_path) { + for line in content.lines() { + if let Some(value) = line.trim().strip_prefix("CONTINUUM_STORAGE_PATH=") { + let value = value.trim(); + if !value.is_empty() { + return PathBuf::from(value); + } + } + } + } + return PathBuf::from(home).join(".continuum"); + } + PathBuf::from("/tmp").join(".continuum") +} + +fn find_model_dir_in_root(model_id: &str, root: &Path) -> Option { + if !root.exists() { + return None; + } + + for entry in fs::read_dir(root).ok()?.flatten() { + let path = entry.path(); + if !path.is_dir() || first_gguf_in_dir(&path).is_none() { + continue; + } + let dir_name = path.file_name()?.to_str()?.to_lowercase(); + let model_lower = 
model_id.to_lowercase(); + if model_lower.contains("qwen") + && model_lower.contains("compacted") + && dir_name.contains("qwen") + && dir_name.contains("compacted") + { + let size_match = ["14b", "32b", "7b", "4b", "3b", "1b"] + .iter() + .find(|s| model_lower.contains(*s)); + if let Some(size) = size_match { + if dir_name.contains(size) { + return Some(path); + } + } else { + return Some(path); + } + } + if let Some(repo_name) = model_id.split('/').next_back() { + let repo_lower = repo_name.to_lowercase().replace('.', ""); + if dir_name.contains(&repo_lower) { + return Some(path); + } + } + } + None +} + +fn resolve_from_huggingface_hint(hint: &str) -> Option { + let repo_slug = hf_repo_slug(hint)?; + let cache = huggingface_cache_root()?; + let model_dir = find_hf_model_dir(&cache, &repo_slug)?; + find_ggufs_under_snapshots(&model_dir) +} + +fn resolve_from_huggingface_model_id(model_id: &str) -> Option { + let cache = huggingface_cache_root()?; + let wanted = model_id.to_lowercase().replace('/', "--"); + let mut candidates = Vec::new(); + for entry in fs::read_dir(cache).ok()?.flatten() { + let name = entry.file_name().to_string_lossy().to_lowercase(); + if name.starts_with("models--") && name.contains(&wanted) { + if let Some(gguf) = find_ggufs_under_snapshots(&entry.path()) { + candidates.push(gguf); + } + } + } + pick_best_candidate(candidates) +} + +fn hf_repo_slug(hint: &str) -> Option { + let trimmed = hint + .strip_prefix("huggingface.co/") + .unwrap_or(hint) + .split(':') + .next()? + .trim_matches('/'); + let parts: Vec<&str> = trimmed.split('/').filter(|part| !part.is_empty()).collect(); + if parts.len() < 2 { + return None; + } + Some(format!( + "{}--{}", + parts[parts.len() - 2], + parts[parts.len() - 1] + )) +} + +fn huggingface_cache_root() -> Option { + if let Ok(hf_home) = std::env::var("HF_HOME") { + if !hf_home.trim().is_empty() { + return Some(PathBuf::from(hf_home).join("hub")); + } + } + Some( + PathBuf::from(home_dir_string()?) 
+ .join(".cache") + .join("huggingface") + .join("hub"), + ) +} + +fn find_hf_model_dir(cache: &Path, repo_slug: &str) -> Option { + let wanted = format!("models--{}", repo_slug).to_lowercase(); + for entry in fs::read_dir(cache).ok()?.flatten() { + let name = entry.file_name().to_string_lossy().to_lowercase(); + if name == wanted { + return Some(entry.path()); + } + } + None +} + +fn find_ggufs_under_snapshots(model_dir: &Path) -> Option { + let snapshots = model_dir.join("snapshots"); + let mut candidates = Vec::new(); + for snap in fs::read_dir(snapshots).ok()?.flatten() { + let Ok(files) = fs::read_dir(snap.path()) else { + continue; + }; + for file in files.flatten() { + let p = file.path(); + if is_gguf(&p) { + candidates.push(p); + } + } + } + pick_best_candidate(candidates) +} + +fn collect_ggufs_recursive(dir: &Path, out: &mut Vec) { + let Ok(entries) = fs::read_dir(dir) else { + return; + }; + for entry in entries.flatten() { + let p = entry.path(); + if p.is_dir() { + collect_ggufs_recursive(&p, out); + } else if is_gguf(&p) { + out.push(p); + } + } +} + +fn first_gguf_in_dir(dir: &Path) -> Option { + let mut candidates = Vec::new(); + for entry in fs::read_dir(dir).ok()?.flatten() { + let p = entry.path(); + if is_gguf(&p) { + candidates.push(p); + } + } + pick_best_candidate(candidates) +} + +fn pick_best_candidate(mut candidates: Vec) -> Option { + candidates.sort_by(|a, b| { + let ma = fs::metadata(a).and_then(|m| m.modified()).ok(); + let mb = fs::metadata(b).and_then(|m| m.modified()).ok(); + mb.cmp(&ma).then_with(|| a.cmp(b)) + }); + candidates.into_iter().next() +} + +fn is_gguf(path: &Path) -> bool { + path.extension() + .and_then(|s| s.to_str()) + .is_some_and(|ext| ext.eq_ignore_ascii_case("gguf")) +} + +fn home_dir_string() -> Option { + std::env::var("HOME") + .ok() + .or_else(|| std::env::var("USERPROFILE").ok()) +} + +#[cfg(test)] +pub(crate) fn with_test_home(home: &Path, f: impl FnOnce() -> T) -> T { + use std::sync::{Mutex, OnceLock}; + 
+ static ENV_LOCK: OnceLock> = OnceLock::new(); + let _guard = ENV_LOCK + .get_or_init(|| Mutex::new(())) + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + let prior_home = std::env::var("HOME").ok(); + let prior_userprofile = std::env::var("USERPROFILE").ok(); + let prior_hf_home = std::env::var("HF_HOME").ok(); + std::env::set_var("HOME", home); + std::env::remove_var("USERPROFILE"); + std::env::remove_var("HF_HOME"); + let result = f(); + if let Some(value) = prior_home { + std::env::set_var("HOME", value); + } else { + std::env::remove_var("HOME"); + } + if let Some(value) = prior_userprofile { + std::env::set_var("USERPROFILE", value); + } else { + std::env::remove_var("USERPROFILE"); + } + if let Some(value) = prior_hf_home { + std::env::set_var("HF_HOME", value); + } else { + std::env::remove_var("HF_HOME"); + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::model_registry::types::{Arch, Capability}; + use std::collections::BTreeSet; + + fn model(id: &str, hint: Option<&str>, explicit: Option) -> Model { + Model { + id: id.to_string(), + name: None, + provider: "llamacpp-local".into(), + arch: Arch::Qwen35, + context_window: 262144, + max_output_tokens: 32768, + tokens_per_second: 33.0, + capabilities: BTreeSet::from([ + Capability::TextGeneration, + Capability::Chat, + Capability::ToolUse, + ]), + cost_input_per_1k: 0.0, + cost_output_per_1k: 0.0, + gguf_hint: hint.map(str::to_string), + gguf_local_path: explicit, + mmproj_local_path: None, + chat_template: None, + multi_party_strategy: Default::default(), + stop_sequences: Vec::new(), + } + } + + #[test] + fn resolves_huggingface_cache_from_hint_when_explicit_path_is_stale() { + let home = tempfile::tempdir().unwrap(); + with_test_home(home.path(), || { + let cached = home.path().join( + ".cache/huggingface/hub/models--continuum-ai--qwen3.5-4b-code-forged-GGUF/snapshots/abc", + ); + fs::create_dir_all(&cached).unwrap(); + let gguf = 
cached.join("qwen3.5-4b-code-forged-Q4_K_M.gguf"); + fs::write(&gguf, b"gguf").unwrap(); + + let resolved = resolve_gguf_for_model(&model( + "continuum-ai/qwen3.5-4b-code-forged-GGUF", + Some("huggingface.co/continuum-ai/qwen3.5-4b-code-forged-gguf"), + Some(PathBuf::from("~/missing/docker/bundle/model.gguf")), + )); + + assert_eq!(resolved.as_deref(), Some(gguf.as_path())); + }); + } + + #[test] + fn explicit_existing_path_wins() { + let home = tempfile::tempdir().unwrap(); + with_test_home(home.path(), || { + let explicit = home.path().join("models").join("model.gguf"); + fs::create_dir_all(explicit.parent().unwrap()).unwrap(); + fs::write(&explicit, b"gguf").unwrap(); + let resolved = resolve_gguf_for_model(&model( + "continuum-ai/qwen3.5-4b-code-forged-GGUF", + Some("huggingface.co/continuum-ai/qwen3.5-4b-code-forged-gguf"), + Some(PathBuf::from("~/models/model.gguf")), + )); + assert_eq!(resolved.as_deref(), Some(explicit.as_path())); + }); + } +} diff --git a/src/workers/continuum-core/src/model_registry/loader.rs b/src/workers/continuum-core/src/model_registry/loader.rs index 057b770b2..f0c2a7e60 100644 --- a/src/workers/continuum-core/src/model_registry/loader.rs +++ b/src/workers/continuum-core/src/model_registry/loader.rs @@ -1,6 +1,6 @@ //! Registry loader — parses `models.toml` + `providers.toml` into typed //! `Model` / `Provider` records, validates cross-references, and -//! resolves local GGUF paths from DMR's on-disk manifest when possible. +//! resolves local GGUF paths from each model's canonical `gguf_hint`. //! //! Entry points: //! - [`load_registry`] — single call, returns a validated `Registry`. @@ -10,6 +10,7 @@ //! `provider` doesn't resolve to a registered `Provider` — each gets its //! own variant so the caller's logs pinpoint the issue. 
+use super::artifacts::{expand_user_path, resolve_model_artifacts}; use super::types::{Model, Provider}; use serde::Deserialize; use std::collections::HashMap; @@ -127,9 +128,10 @@ pub fn load_providers(path: impl AsRef) -> Result, RegistryE /// - no duplicate provider ids /// - every `Model.provider` resolves to a registered provider /// -/// Does NOT attempt to resolve `gguf_local_path` — that's a DMR-manifest -/// concern handled after load. See [`resolve_local_gguf_paths`] for the -/// optional post-load pass that does it. +/// Resolves local GGUF paths from either an explicit `gguf_local_path` or the +/// Hugging Face cache implied by `gguf_hint`. A hand-pinned local path is only +/// authoritative when it exists; stale machine-specific Docker bundle paths +/// must not make an already-downloaded model invisible. pub fn load_registry( models_path: impl AsRef, providers_path: impl AsRef, @@ -156,70 +158,13 @@ pub fn load_registry( provider_id: m.provider, }); } - // Expand `~` / `$HOME` in gguf_local_path so TOML authors can - // write portable paths. Done here (at load) rather than at every - // read site so the stored PathBuf is already absolute. - if let Some(p) = m.gguf_local_path.take() { - m.gguf_local_path = Some(expand_path(&p)); - } - // Same expansion for the multimodal projector path — added with - // the Qwen2-VL-7B vision row 2026-04-21. Without this the local - // mtmd path would fail to find `~/models/...` paths the same way - // gguf_local_path used to before its expansion was added. - if let Some(p) = m.mmproj_local_path.take() { - m.mmproj_local_path = Some(expand_path(&p)); - } + resolve_model_artifacts(&mut m); models.insert(m.id.clone(), m); } Ok(Registry { models, providers }) } -/// Expand `~` / `$HOME` (Unix) or `%USERPROFILE%` (Windows) prefixes in -/// a path so the stored value is absolute. Anything that doesn't start -/// with one of those prefixes is returned unchanged. 
No recursive -/// env-var interpolation — deliberately narrow so a typo in TOML -/// produces a literal-looking bad path rather than something shell- -/// interpreted. -/// -/// Cross-platform note: `~` works on Windows shells too because -/// PowerShell + cmd accept it via TildeExpansion in many contexts, but -/// our TOML is read as raw text — we have to do the expansion ourselves -/// against `USERPROFILE` (Windows convention) when `HOME` isn't set. -/// Without this, Windows installs that follow the Carl/Dev install path -/// will fail to find any TOML row that uses `~/models/...` (which is -/// the convention we use throughout config/models.toml). -fn expand_path(p: &Path) -> PathBuf { - let s = p.to_string_lossy(); - // Resolve home from HOME (Unix) or USERPROFILE (Windows). HOME is - // checked first because some Windows dev environments (Git Bash, - // WSL) set it; otherwise fall through to USERPROFILE. - let home = std::env::var("HOME") - .ok() - .or_else(|| std::env::var("USERPROFILE").ok()); - if let Some(home) = home { - if let Some(rest) = s.strip_prefix("~/") { - return PathBuf::from(format!("{home}/{rest}")); - } - if s == "~" { - return PathBuf::from(home); - } - if let Some(rest) = s.strip_prefix("$HOME/") { - return PathBuf::from(format!("{home}/{rest}")); - } - // Windows-style: %USERPROFILE%/... — uncommon in TOML written - // by Unix-leaning devs but supported so a Windows operator - // editing config/models.toml in their native style works too. 
- if let Some(rest) = s.strip_prefix("%USERPROFILE%/") { - return PathBuf::from(format!("{home}/{rest}")); - } - if let Some(rest) = s.strip_prefix("%USERPROFILE%\\") { - return PathBuf::from(format!("{home}\\{rest}")); - } - } - p.to_path_buf() -} - #[cfg(test)] mod tests { use super::*; @@ -378,6 +323,53 @@ auth = "none" ); } + #[test] + fn resolves_gguf_hint_from_huggingface_cache_when_local_path_absent_or_stale() { + let dir = tempfile::tempdir().unwrap(); + let home = tempfile::tempdir().unwrap(); + crate::model_registry::artifacts::with_test_home(home.path(), || { + let cached = home + .path() + .join(".cache/huggingface/hub/models--continuum-ai--qwen3.5-4b-code-forged-GGUF/snapshots/abc"); + fs::create_dir_all(&cached).unwrap(); + let gguf = cached.join("qwen3.5-4b-code-forged-Q4_K_M.gguf"); + fs::write(&gguf, b"gguf").unwrap(); + + let mp = write( + dir.path(), + "models.toml", + r#" +[[model]] +id = "continuum-ai/qwen3.5-4b-code-forged-GGUF" +provider = "llamacpp-local" +arch = "qwen35" +context_window = 262144 +max_output_tokens = 32768 +tokens_per_second = 33.0 +capabilities = ["text-generation", "chat", "tool-use"] +gguf_hint = "huggingface.co/continuum-ai/qwen3.5-4b-code-forged-gguf" +gguf_local_path = "~/missing/docker/bundle/model.gguf" +"#, + ); + let pp = write( + dir.path(), + "providers.toml", + r#" +[[provider]] +id = "llamacpp-local" +base_url = "local://llamacpp" +auth = "none" +"#, + ); + + let reg = load_registry(mp, pp).expect("registry should load"); + let model = reg + .model("continuum-ai/qwen3.5-4b-code-forged-GGUF") + .expect("model registered"); + assert_eq!(model.gguf_local_path.as_deref(), Some(gguf.as_path())); + }); + } + #[test] fn real_config_files_parse_and_validate() { // The actual seeded files in the repo must always parse and @@ -424,35 +416,30 @@ auth = "none" #[test] fn expand_path_handles_home_prefixes() { - // Save current HOME to restore at the end — other tests share the env. 
- let prior = std::env::var("HOME").ok(); - std::env::set_var("HOME", "/tmp/fake-home"); - - assert_eq!( - expand_path(Path::new("~/models/foo.gguf")), - PathBuf::from("/tmp/fake-home/models/foo.gguf"), - ); - assert_eq!(expand_path(Path::new("~")), PathBuf::from("/tmp/fake-home")); - assert_eq!( - expand_path(Path::new("$HOME/bar.gguf")), - PathBuf::from("/tmp/fake-home/bar.gguf"), - ); - // Literal absolute path untouched. - assert_eq!( - expand_path(Path::new("/opt/models/x.gguf")), - PathBuf::from("/opt/models/x.gguf"), - ); - // Literal relative path untouched — we only expand `~` / `$HOME`. - assert_eq!( - expand_path(Path::new("models/x.gguf")), - PathBuf::from("models/x.gguf"), - ); - - if let Some(h) = prior { - std::env::set_var("HOME", h); - } else { - std::env::remove_var("HOME"); - } + crate::model_registry::artifacts::with_test_home(Path::new("/tmp/fake-home"), || { + assert_eq!( + expand_user_path(Path::new("~/models/foo.gguf")), + PathBuf::from("/tmp/fake-home/models/foo.gguf"), + ); + assert_eq!( + expand_user_path(Path::new("~")), + PathBuf::from("/tmp/fake-home") + ); + assert_eq!( + expand_user_path(Path::new("$HOME/bar.gguf")), + PathBuf::from("/tmp/fake-home/bar.gguf"), + ); + // Literal absolute path untouched. + assert_eq!( + expand_user_path(Path::new("/opt/models/x.gguf")), + PathBuf::from("/opt/models/x.gguf"), + ); + // Literal relative path untouched — we only expand `~` / `$HOME`. + assert_eq!( + expand_user_path(Path::new("models/x.gguf")), + PathBuf::from("models/x.gguf"), + ); + }); } #[test] diff --git a/src/workers/continuum-core/src/model_registry/mod.rs b/src/workers/continuum-core/src/model_registry/mod.rs index 1b853596a..6d7763b5e 100644 --- a/src/workers/continuum-core/src/model_registry/mod.rs +++ b/src/workers/continuum-core/src/model_registry/mod.rs @@ -19,10 +19,15 @@ //! variant AND a TOML row — but the TOML rows for existing arches //! remain unaffected. 
+pub mod artifacts; pub mod loader; pub mod singleton; pub mod types; +pub use artifacts::{ + find_first_local_gguf, resolve_gguf_for_model, resolve_gguf_for_model_id, + resolve_local_model_dir_for_model_id, +}; pub use loader::{load_models, load_providers, load_registry, Registry, RegistryError}; pub use singleton::{global, init_global, try_global}; pub use types::{Arch, AuthKind, Capability, Model, Provider}; diff --git a/src/workers/continuum-core/src/model_registry/types.rs b/src/workers/continuum-core/src/model_registry/types.rs index b46eff621..42eb461b9 100644 --- a/src/workers/continuum-core/src/model_registry/types.rs +++ b/src/workers/continuum-core/src/model_registry/types.rs @@ -43,7 +43,9 @@ pub enum Arch { /// the `cognition/respond` IPC payload both carry capability vocab as /// a list of these values. TS hosts read/write the same kebab-case /// strings serde produces. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, ts_rs::TS)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, ts_rs::TS, +)] #[ts( export, export_to = "../../../shared/generated/model_registry/Capability.ts" @@ -181,9 +183,10 @@ pub struct Model { #[serde(default)] pub gguf_hint: Option, /// Resolved local filesystem path to the GGUF. Populated at registry - /// load by the loader (via DMR manifest lookup from `gguf_hint`), - /// NOT by the TOML author. TOML may leave this absent; the loader - /// fills it if the GGUF is pulled locally. + /// load by the artifact resolver from `gguf_hint`, local model roots, + /// or an explicit path if one exists. TOML should normally leave this + /// absent for portable models; the loader fills it when the artifact is + /// already pulled locally. #[serde(default)] pub gguf_local_path: Option, /// Local filesystem path to the multimodal projector GGUF (mmproj). 
diff --git a/src/workers/continuum-core/src/modules/ai_provider.rs b/src/workers/continuum-core/src/modules/ai_provider.rs index b387db403..351c276f3 100644 --- a/src/workers/continuum-core/src/modules/ai_provider.rs +++ b/src/workers/continuum-core/src/modules/ai_provider.rs @@ -325,7 +325,8 @@ impl AIProviderModule { for model_meta in reg_arc.models_for_provider(crate::inference::LLAMACPP_PROVIDER_ID) { let Some(gguf_path) = model_meta.gguf_local_path.clone() else { self.log().info(&format!( - "Skipping in-process adapter for `{}` — no gguf_local_path in TOML", + "Skipping in-process adapter for `{}` — artifact resolver found no local GGUF. \ + Pull the model identified by gguf_hint or run the model download flow.", model_meta.id )); continue; diff --git a/src/workers/continuum-core/src/persona/allocator.rs b/src/workers/continuum-core/src/persona/allocator.rs index ff97e1477..edcbde67b 100644 --- a/src/workers/continuum-core/src/persona/allocator.rs +++ b/src/workers/continuum-core/src/persona/allocator.rs @@ -7,11 +7,9 @@ //! Rust owns the decision; TypeScript calls `persona/allocate` IPC and uses the result. //! //! Allocation strategy — per-persona tiered model selection: -//! 32GB+ CUDA (5090): CodeReview(32B/20GB) + Teacher(14B/9GB) + Helper(8B/5GB) + Local(3B/3GB) -//! 24-31GB Metal (M-Max): Teacher(14B/9GB) + Helper(8B/5GB) + Local(3B/3GB) -//! 16-23GB Metal (M-Pro): Teacher(8B/5GB) + Helper(3B/3GB) + Local(3B/3GB) -//! 8-15GB (MacBook Air): Helper(3B/3GB) -//! <8GB / CPU: Helper(3B/3GB, CPU mode) +//! 32GB+ unified/VRAM: shared Qwen3.5 text personas + Qwen2-VL vision +//! 16GB+ unified/VRAM: shared Qwen3.5 text personas, vision when budget allows +//! <16GB / CPU: reduced local fleet selected from the same Qwen catalog //! + per cloud API key: One persona per key (0GB VRAM) use serde::{Deserialize, Serialize}; @@ -139,16 +137,8 @@ const SYSTEM_RESERVE_GB: f64 = 2.0; /// Select the best local model given total VRAM (system-wide default). 
/// Thresholds use 0.5GB margin — GPUs report slightly less than nominal /// (e.g. RTX 5090 "32GB" reports 31.84GB). -pub fn select_local_model(vram_gb: f64) -> &'static str { - if vram_gb >= 31.0 { - "coder-32b" // 32B compacted — SOTA for 5090/A100 - } else if vram_gb >= 15.0 { - "coder" // 14B compacted — fits MacBook Pro 16GB+ - } else if vram_gb >= 8.0 { - "unsloth/Llama-3.1-8B-Instruct" - } else { - "unsloth/Llama-3.2-3B-Instruct" - } +pub fn select_local_model(_vram_gb: f64) -> &'static str { + "continuum-ai/qwen3.5-4b-code-forged-GGUF" } /// Detect GPU type from the manager's device name. @@ -197,10 +187,9 @@ pub fn allocate( let gpu_name = gpu_manager.gpu_name().to_string(); let gpu_type = detect_gpu_type(&gpu_name).to_string(); - // In CPU mode (no GPU / Docker without GPU passthrough), use system RAM as - // the memory budget. Candle inference runs on CPU using system RAM — the VRAM - // field is zero but we still have memory to work with. Reserve 4GB for OS + - // Docker overhead, use the rest for models. + // In CPU/container mode (no GPU / Docker without GPU passthrough), use + // system RAM as the memory budget. Runtime local chat is llama.cpp/Qwen, + // not Candle; Candle remains a training/auxiliary concern. let system_ram_gb = { #[cfg(target_os = "linux")] { @@ -272,8 +261,6 @@ pub fn allocate( let has_api_key = |env_var: &str| -> bool { available_api_keys.iter().any(|k| k == env_var) }; - let mut any_candle_allocated = false; - for entry in catalog { let mut allocation = PersonaAllocation { unique_id: entry.unique_id.clone(), @@ -304,11 +291,11 @@ pub fn allocate( continue; } - // Local candle inference: check memory budget (VRAM or system RAM). + // Local llama.cpp/Qwen inference: check memory budget (VRAM/unified/RAM). // Model sharing: if two personas use the same model, the model loads ONCE. // The second persona's cost is ~0 (just config overhead). This means a - // 24GB Docker container can run 4+ candle personas off one 3GB model. 
- if entry.provider == "candle" { + // 24GB Docker container can run multiple local personas off one model. + if entry.provider == "local" { let resolved = resolve_model_for_persona(entry, effective_memory_gb, &local_model); let model_name = resolved.model.clone(); let needed_gb = resolved.vram_budget_gb; @@ -340,7 +327,6 @@ pub fn allocate( models_loaded.insert(model_name, needed_gb); } vram_allocated_gb += additional_cost; - any_candle_allocated = true; allocations.push(allocation); } else { allocation.reason = format!( @@ -462,14 +448,10 @@ mod tests { #[test] fn test_select_local_model() { - assert_eq!(select_local_model(32.0), "coder-32b"); - assert_eq!(select_local_model(48.0), "coder-32b"); - assert_eq!(select_local_model(31.84), "coder-32b"); // RTX 5090 reports 31.84 - assert_eq!(select_local_model(24.0), "coder"); - assert_eq!(select_local_model(16.0), "coder"); - assert_eq!(select_local_model(15.5), "coder"); - assert_eq!(select_local_model(8.0), "unsloth/Llama-3.1-8B-Instruct"); - assert_eq!(select_local_model(4.0), "unsloth/Llama-3.2-3B-Instruct"); + assert_eq!(select_local_model(32.0), "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(select_local_model(48.0), "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(select_local_model(16.0), "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(select_local_model(4.0), "continuum-ai/qwen3.5-4b-code-forged-GGUF"); } #[test] @@ -505,14 +487,14 @@ mod tests { let catalog = load_catalog(); let result = allocate(&manager, &[], &catalog); - // Should always create at least one candle persona (CPU fallback) - let candle_count = result + // Should always create at least one local persona. 
+ let local_count = result .allocations .iter() - .filter(|a| a.provider == "candle") + .filter(|a| a.provider == "local") .count(); assert!( - candle_count >= 1, + local_count >= 1, "Should create at least one local persona" ); @@ -520,7 +502,7 @@ mod tests { let cloud_count = result .allocations .iter() - .filter(|a| a.api_key_env.is_some() && a.provider != "candle") + .filter(|a| a.api_key_env.is_some() && a.provider != "local") .count(); assert_eq!( cloud_count, 0, @@ -551,7 +533,7 @@ mod tests { let entry = PersonaCatalogEntry { unique_id: "codereview".to_string(), display_name: "CodeReview AI".to_string(), - provider: "candle".to_string(), + provider: "local".to_string(), persona_type: "persona".to_string(), voice_id: None, model_id: Some("coder".to_string()), @@ -564,31 +546,31 @@ mod tests { model_preferences: vec![ ModelPreference { min_vram_gb: 32.0, - model: "coder-32b".to_string(), + model: "continuum-ai/qwen3.5-27b-code-forged".to_string(), vram_budget_gb: 20.0, }, ModelPreference { min_vram_gb: 16.0, - model: "coder".to_string(), - vram_budget_gb: 9.0, + model: "continuum-ai/qwen3.5-4b-code-forged-GGUF".to_string(), + vram_budget_gb: 3.0, }, ], }; - // 32GB → gets 32B model - let r = resolve_model_for_persona(&entry, 32.0, "coder-32b"); - assert_eq!(r.model, "coder-32b"); + // 32GB → gets larger Qwen3.5 model when catalog permits + let r = resolve_model_for_persona(&entry, 32.0, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.model, "continuum-ai/qwen3.5-27b-code-forged"); assert_eq!(r.vram_budget_gb, 20.0); - // 24GB → gets 14B model (32B doesn't fit tier) - let r = resolve_model_for_persona(&entry, 24.0, "coder"); - assert_eq!(r.model, "coder"); - assert_eq!(r.vram_budget_gb, 9.0); + // 24GB → gets forged Qwen3.5 default + let r = resolve_model_for_persona(&entry, 24.0, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.model, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.vram_budget_gb, 3.0); // 8GB → falls to 
lowest preference - let r = resolve_model_for_persona(&entry, 8.0, "unsloth/Llama-3.1-8B-Instruct"); - assert_eq!(r.model, "coder"); - assert_eq!(r.vram_budget_gb, 9.0); + let r = resolve_model_for_persona(&entry, 8.0, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.model, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.vram_budget_gb, 3.0); } #[test] @@ -596,10 +578,10 @@ mod tests { let entry = PersonaCatalogEntry { unique_id: "helper".to_string(), display_name: "Helper AI".to_string(), - provider: "candle".to_string(), + provider: "local".to_string(), persona_type: "persona".to_string(), voice_id: None, - model_id: Some("unsloth/Llama-3.2-3B-Instruct".to_string()), + model_id: Some("continuum-ai/qwen3.5-4b-code-forged-GGUF".to_string()), is_audio_native: false, api_key_env: None, min_vram_gb: Some(3.0), @@ -609,8 +591,8 @@ mod tests { model_preferences: vec![], // No preferences → legacy path }; - let r = resolve_model_for_persona(&entry, 32.0, "coder-32b"); - assert_eq!(r.model, "unsloth/Llama-3.2-3B-Instruct"); + let r = resolve_model_for_persona(&entry, 32.0, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); + assert_eq!(r.model, "continuum-ai/qwen3.5-4b-code-forged-GGUF"); assert_eq!(r.vram_budget_gb, 3.0); } @@ -628,12 +610,27 @@ mod tests { "CodeReview should have model_preferences in catalog.json" ); - // Verify highest tier is first + // Verify local runtime uses the Qwen registry, not legacy training backends. 
let first = &codereview.model_preferences[0]; - assert!( - first.min_vram_gb >= 31.0, - "First preference should be for 31GB+ (was {}GB)", - first.min_vram_gb + assert_eq!( + codereview.provider, "local", + "Runtime persona provider must be local, not training backend" + ); + assert_eq!( + first.model, + "continuum-ai/qwen3.5-4b-code-forged-GGUF", + "CodeReview should use the Qwen3.5 local registry default" + ); + + let vision = catalog + .iter() + .find(|e| e.unique_id == "vision") + .expect("Vision AI should be in the Rust persona catalog"); + assert_eq!(vision.provider, "local"); + assert_eq!( + vision.model_preferences[0].model, + "qwen2-vl-7b-instruct", + "Vision AI should use the Qwen2-VL local registry default" ); } @@ -646,31 +643,30 @@ mod tests { let catalog = load_catalog(); let result = allocate(&manager, &[], &catalog); - // Find candle personas - let candle: Vec<_> = result + // Find local personas + let local: Vec<_> = result .allocations .iter() - .filter(|a| a.provider == "candle") + .filter(|a| a.provider == "local") .collect(); - assert!(!candle.is_empty(), "Should have candle personas"); + assert!(!local.is_empty(), "Should have local personas"); - // CodeReview should get coder-32b on 5090 - if let Some(cr) = candle.iter().find(|a| a.unique_id == "codereview") { + // CodeReview should get the shared Qwen3.5 local default. 
+ if let Some(cr) = local.iter().find(|a| a.unique_id == "codereview") { assert_eq!( cr.resolved_model.as_deref(), - Some("coder-32b"), - "CodeReview on 5090 should get coder-32b, got {:?}", + Some("continuum-ai/qwen3.5-4b-code-forged-GGUF"), + "CodeReview should get Qwen3.5 local default, got {:?}", cr.resolved_model ); } - // Teacher should get 8B (14B budget goes to CodeReview's 32B model) - if let Some(t) = candle.iter().find(|a| a.unique_id == "teacher") { + if let Some(t) = local.iter().find(|a| a.unique_id == "teacher") { assert_eq!( t.resolved_model.as_deref(), - Some("unsloth/Llama-3.1-8B-Instruct"), - "Teacher on 5090 should get Llama-3.1-8B, got {:?}", + Some("continuum-ai/qwen3.5-4b-code-forged-GGUF"), + "Teacher should get Qwen3.5 local default, got {:?}", t.resolved_model ); } @@ -685,21 +681,13 @@ mod tests { let catalog = load_catalog(); let result = allocate(&manager, &[], &catalog); - let candle: Vec<_> = result + let local: Vec<_> = result .allocations .iter() - .filter(|a| a.provider == "candle") + .filter(|a| a.provider == "local") .collect(); - // CodeReview needs too much VRAM for 16GB — should be skipped - let cr = candle.iter().find(|a| a.unique_id == "codereview"); - if let Some(cr) = cr { - // If it was allocated, it should NOT have the 32B model - assert_ne!( - cr.resolved_model.as_deref(), - Some("coder-32b"), - "CodeReview on 16GB should NOT get coder-32b" - ); - } + assert!(local.iter().any(|a| a.unique_id == "codereview")); + assert!(local.iter().any(|a| a.unique_id == "helper")); } } diff --git a/src/workers/continuum-core/src/persona/catalog.json b/src/workers/continuum-core/src/persona/catalog.json index 688525106..80004c281 100644 --- a/src/workers/continuum-core/src/persona/catalog.json +++ b/src/workers/continuum-core/src/persona/catalog.json @@ -24,7 +24,7 @@ { "uniqueId": "codereview", "displayName": "CodeReview AI", - "provider": "candle", + "provider": "local", "type": "persona", "voiceId": "100", "minVramGB": 9, @@ -32,14 
+32,13 @@ "speciality": "code-analysis", "accentColor": "#e91e63", "modelPreferences": [ - { "minVramGb": 31, "model": "coder-32b", "vramBudgetGb": 20 }, - { "minVramGb": 16, "model": "coder", "vramBudgetGb": 9 } + { "minVramGb": 0, "model": "continuum-ai/qwen3.5-4b-code-forged-GGUF", "vramBudgetGb": 3 } ] }, { "uniqueId": "teacher", "displayName": "Teacher AI", - "provider": "candle", + "provider": "local", "type": "persona", "voiceId": "75", "minVramGB": 5, @@ -47,16 +46,13 @@ "speciality": "education-mentoring", "accentColor": "#ff9800", "modelPreferences": [ - { "minVramGb": 31, "model": "unsloth/Llama-3.1-8B-Instruct", "vramBudgetGb": 5 }, - { "minVramGb": 24, "model": "coder", "vramBudgetGb": 9 }, - { "minVramGb": 16, "model": "unsloth/Llama-3.1-8B-Instruct", "vramBudgetGb": 5 }, - { "minVramGb": 8, "model": "unsloth/Llama-3.2-3B-Instruct", "vramBudgetGb": 3 } + { "minVramGb": 0, "model": "continuum-ai/qwen3.5-4b-code-forged-GGUF", "vramBudgetGb": 3 } ] }, { "uniqueId": "helper", "displayName": "Helper AI", - "provider": "candle", + "provider": "local", "type": "persona", "voiceId": "50", "minVramGB": 3, @@ -64,10 +60,7 @@ "speciality": "practical-assistance", "accentColor": "#00d4ff", "modelPreferences": [ - { "minVramGb": 31, "model": "unsloth/Llama-3.2-3B-Instruct", "vramBudgetGb": 3 }, - { "minVramGb": 24, "model": "unsloth/Llama-3.1-8B-Instruct", "vramBudgetGb": 5 }, - { "minVramGb": 8, "model": "unsloth/Llama-3.2-3B-Instruct", "vramBudgetGb": 3 }, - { "minVramGb": 0, "model": "unsloth/Llama-3.2-3B-Instruct", "vramBudgetGb": 3 } + { "minVramGb": 0, "model": "continuum-ai/qwen3.5-4b-code-forged-GGUF", "vramBudgetGb": 3 } ] }, { @@ -150,15 +143,29 @@ { "uniqueId": "local", "displayName": "Local Assistant", - "provider": "candle", + "provider": "local", "type": "persona", "voiceId": "90", "minVramGB": 3, - "bio": "Local Candle inference — runs entirely on your hardware, no cloud dependency", + "bio": "Local Qwen inference — runs entirely on your hardware, 
no cloud dependency", "speciality": "general", "accentColor": "#8bc34a", "modelPreferences": [ - { "minVramGb": 0, "model": "unsloth/Llama-3.2-3B-Instruct", "vramBudgetGb": 3 } + { "minVramGb": 0, "model": "continuum-ai/qwen3.5-4b-code-forged-GGUF", "vramBudgetGb": 3 } + ] + }, + { + "uniqueId": "vision", + "displayName": "Vision AI", + "provider": "local", + "type": "persona", + "voiceId": "105", + "minVramGB": 5, + "bio": "Native local vision persona powered by Qwen2-VL for image understanding", + "speciality": "vision", + "accentColor": "#009688", + "modelPreferences": [ + { "minVramGb": 0, "model": "qwen2-vl-7b-instruct", "vramBudgetGb": 5 } ] }, { diff --git a/src/workers/continuum-core/src/persona/evaluator.rs b/src/workers/continuum-core/src/persona/evaluator.rs index 3dfc18d90..3fc9b0123 100644 --- a/src/workers/continuum-core/src/persona/evaluator.rs +++ b/src/workers/continuum-core/src/persona/evaluator.rs @@ -5,8 +5,9 @@ //! //! Gate order (short-circuits on first SILENT): //! 1. Sleep mode — checks SleepMode + topic similarity (persona's own opt-out) -//! 2. Self-message — infinite loop prevention (inside fast_path) -//! 3. Fast-path decision — delegates to PersonaCognitionEngine::fast_path_decision +//! 2. Undirected persona chatter — one persona turn must not recursively summon another +//! 3. Self-message — infinite loop prevention (inside fast_path) +//! 4. Fast-path decision — delegates to PersonaCognitionEngine::fast_path_decision //! //! Note: response_count is collected as a SIGNAL (LLM sees it in social_signals //! and can self-quiet if a conversation is getting too noisy) but is NOT a hard @@ -298,9 +299,10 @@ pub struct GateDetails { /// /// Hard gates (system protection only): /// 1. Sleep mode — persona's OWN voluntary decision (respects autonomy) -/// 2. Non-human echo storm — undirected AI/agent chatter is suppressed once +/// 2. Undirected persona chatter — one persona turn completes the room turn +/// 3. 
Non-human echo storm — undirected AI/agent chatter is suppressed once /// the room is already AI-heavy -/// 3. Self-message — infinite loop prevention (inside fast_path) +/// 4. Self-message — infinite loop prevention (inside fast_path) /// /// Removed: response cap. Was a cloud-provider "resource exhaustion" concept /// that blocked local personas (which have zero cost) after 50 responses per @@ -414,12 +416,44 @@ pub fn full_evaluate( } // ========================================================================= - // HARD GATE 2: Non-human echo storm. + // HARD GATE 2: Undirected persona chatter. // - // A bridged agent broadcast or another persona's generic reply must not - // summon every persona repeatedly. Human messages and direct mentions still - // flow through normally; only undirected AI/agent/system chatter is damped - // once the recent room window is already AI-heavy. + // A persona response is already a completed room turn. Letting every other + // persona evaluate it recreates the observed echo chain: + // human → Teacher → Helper copies Teacher → Teacher summarizes Helper... + // + // Direct mentions still flow through. Agents are not blocked here because + // bridged humans/coding agents enter as SenderType::Agent and are allowed + // to intentionally feed Continuum over AIRC or other transports. 
+ // ========================================================================= + if request.sender_type == SenderType::Persona && !is_mentioned { + return FullEvaluateResult { + should_respond: false, + confidence: 1.0, + reason: "Undirected persona message completes the room turn".into(), + gate: "persona_turn_complete".into(), + decision_time_ms: start.elapsed().as_secs_f64() * 1000.0, + gate_details: Some(GateDetails { + response_count: Some(response_count), + max_responses: Some(rate_limiter.max_responses_per_session), + rate_limit_wait_seconds: rate_limiter + .rate_limit_wait_seconds(request.room_id, now_ms), + sleep_mode: None, + is_mentioned: Some(is_mentioned), + has_directed_mention: Some(has_directed_mention), + topic_similarity: None, + echo_chamber_ai_count: Some(echo_result.ai_message_count as u32), + }), + social_signals: Some(social_signals), + }; + } + + // ========================================================================= + // HARD GATE 3: Non-human echo storm. + // + // Agent/system broadcasts can intentionally start a Continuum turn, but if + // the room is already AI-heavy and the message is not directed, suppress it + // before it wakes every persona. 
// ========================================================================= let sender_is_non_human = matches!( request.sender_type, @@ -897,6 +931,28 @@ mod tests { assert_eq!(result.gate, "non_human_echo_storm"); } + #[test] + fn test_undirected_persona_message_completes_turn_without_cache_warmup() { + let (engine, persona_id) = test_engine("TestBot"); + let mut request = test_request(persona_id, "TestBot"); + request.sender_type = SenderType::Persona; + request.sender_is_human = false; + request.sender_name = "Teacher AI".into(); + request.content = "Teacher AI: Yes, I can see this startup smoke test.".into(); + + let result = full_evaluate( + &request, + &RateLimiterState::default(), + &SleepState::default(), + &engine, + &RecentMessageCache::new(), + now_ms(), + ); + + assert!(!result.should_respond); + assert_eq!(result.gate, "persona_turn_complete"); + } + #[test] fn test_non_human_echo_storm_allows_direct_mentions() { let (engine, persona_id) = test_engine("TestBot"); diff --git a/src/workers/continuum-core/src/secrets.rs b/src/workers/continuum-core/src/secrets.rs index cc2f500dc..f29da6ee1 100644 --- a/src/workers/continuum-core/src/secrets.rs +++ b/src/workers/continuum-core/src/secrets.rs @@ -42,7 +42,7 @@ impl Secrets { } } - secrets.insert(key.to_string(), value); + secrets.insert(key.to_string(), normalize_env_value(&value)); } } } @@ -59,7 +59,10 @@ impl Secrets { || key.ends_with("_TOKEN") || key.ends_with("_URL") { - secrets.insert(key, value); + let value = normalize_env_value(&value); + if !value.is_empty() { + secrets.insert(key, value); + } } } @@ -68,7 +71,10 @@ impl Secrets { /// Get a secret by key pub fn get(&self, key: &str) -> Option<&str> { - self.secrets.get(key).map(|s| s.as_str()) + self.secrets + .get(key) + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) } /// Get a secret, returning error if missing @@ -83,7 +89,7 @@ impl Secrets { /// Check if a secret exists pub fn has(&self, key: &str) -> bool { - 
self.secrets.contains_key(key) + self.get(key).is_some() } /// Get all available keys (for debugging) @@ -92,6 +98,19 @@ impl Secrets { } } +fn normalize_env_value(raw: &str) -> String { + let value = raw.trim(); + let unquoted = if value.len() >= 2 + && ((value.starts_with('"') && value.ends_with('"')) + || (value.starts_with('\'') && value.ends_with('\''))) + { + &value[1..value.len() - 1] + } else { + value + }; + unquoted.trim().to_string() +} + /// Get the global secrets instance pub fn secrets() -> &'static Secrets { SECRETS.get_or_init(Secrets::load)