diff --git a/.github/workflows/carl-install-smoke.yml b/.github/workflows/carl-install-smoke.yml index fc97ab186..27c563935 100644 --- a/.github/workflows/carl-install-smoke.yml +++ b/.github/workflows/carl-install-smoke.yml @@ -45,6 +45,10 @@ on: description: 'Git ref to fetch install.sh from (sha / branch / tag)' required: false default: '' + image_tag: + description: 'Docker image tag to pull (default: canary). Useful values: canary, latest, pr-, .' + required: false + default: 'canary' jobs: carl-install-smoke-amd64: @@ -68,15 +72,46 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + - name: Install mesa-vulkan-drivers (llvmpipe ICD for no-GPU CI runner) + # The default continuum-core-vulkan binary calls Vulkan via the loader. + # On ubuntu-latest there's no GPU hardware → no real ICD → loader returns + # zero devices → binary panics per Joel's "lack of GPU integration is + # forbidden" rule. mesa-vulkan-drivers installs the llvmpipe software + # ICD so the loader returns a (software) device, the binary sees a real + # Vulkan API surface, and the GPU code path is exercised exactly like + # it would be on a hardware-GPU host. vulkan-tools provides vulkaninfo + # for the slice probes (test-slices.sh). + run: | + sudo apt-get update -y + sudo apt-get install -y mesa-vulkan-drivers vulkan-tools + echo "vulkaninfo summary:" + vulkaninfo --summary 2>&1 | head -20 || true + - name: Login to ghcr.io (so install.sh can pull pre-built images) run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - name: Run carl-install smoke env: - # Pass the PR HEAD sha so the smoke fetches the install.sh from - # THIS PR (not main). Falls back to manual workflow_dispatch input - # when not in a PR context. + # PR HEAD sha so smoke fetches install.sh from THIS PR. 
CARL_INSTALL_REF: ${{ github.event.pull_request.head.sha || inputs.install_ref || github.sha }} + # Pin docker images to :pr-N (PR-scoped, mutable per push). Refreshed + # by push-image.sh on every dev push, so always reflects this PR's + # latest source — but never collides with another PR or canary. + # Slices the dev didn't push directly are aliased from :canary by the + # dev script (manifest copy, no rebuild). :latest was the prior + # default and went 9-14 days stale in April 2026 — never use it for + # smoke. + # + # Resolution priority: PR# > input.image_tag > 'canary'. + # On workflow_dispatch (no PR context) the bare `pr-${{ ... }}` + # interpolated to 'pr-' (empty after dash), causing install.sh to + # miss the registry and fall back to 'will build locally' — which + # then ran a full Rust compile of continuum-core-vulkan on the + # no-GPU runner and hit the 25-min runner cap (observed run + # 25400718464). The conditional below makes manual triggers + # default to the canary tag (the cadence we publish on) and lets + # operators override via the image_tag input from the UI. + CONTINUUM_IMAGE_TAG: ${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || inputs.image_tag || 'canary' }} # 25-min cap on the docker-only install. Hybrid (Mac source-build) # path would exceed this — by design, that's the gate firing on # the README/install mismatch. @@ -91,7 +126,29 @@ jobs: SKIP_TEARDOWN: '0' run: bash scripts/ci/carl-install-smoke.sh - - name: Upload install + page + chat artifacts on failure + - name: Capture docker logs from all containers on failure (continuum-core, + node-server, model-init, widget-server, livekit-bridge) + if: failure() + run: | + # Find the carl-smoke compose project and dump every container's + # logs. 
Without this we get install.log + page + chat — all OUTSIDE + # the containers — but never see WHY continuum-core / node-server + # didn't reply (silent inference failure was the actual blocker + # 2026-05-04 on PR #1038). Capture per-container so the artifact + # shows the inference path, not just the smoke wrapper output. + set +e + for dir in /tmp/carl-smoke-*; do + [ -d "$dir" ] || continue + [ -f "$dir/docker-compose.yml" ] || continue + for svc in continuum-core node-server model-init widget-server livekit-bridge; do + docker compose -f "$dir/docker-compose.yml" logs --no-color --timestamps "$svc" \ + > "${dir}.${svc}.log" 2>&1 + docker compose -f "$dir/docker-compose.yml" ps "$svc" \ + > "${dir}.${svc}.ps" 2>&1 + done + docker compose -f "$dir/docker-compose.yml" ps -a > "${dir}.compose-ps.log" 2>&1 + done + - name: Upload install + page + chat + docker logs + screenshot artifacts on failure if: failure() uses: actions/upload-artifact@v4 with: @@ -99,6 +156,14 @@ jobs: path: | /tmp/carl-smoke-*.install.log /tmp/carl-smoke-*.page.html + /tmp/carl-smoke-*.page.png /tmp/carl-smoke-*.chat.log + /tmp/carl-smoke-*.continuum-core.log + /tmp/carl-smoke-*.node-server.log + /tmp/carl-smoke-*.model-init.log + /tmp/carl-smoke-*.widget-server.log + /tmp/carl-smoke-*.livekit-bridge.log + /tmp/carl-smoke-*.compose-ps.log + /tmp/carl-smoke-*.*.ps retention-days: 7 if-no-files-found: ignore diff --git a/docker-compose.yml b/docker-compose.yml index 2a4a99085..9eb0ea4be 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -67,18 +67,31 @@ services: - WHISPER_MODEL=${WHISPER_MODEL:-base} # ── Continuum Core (Rust) ───────────────────────────────── + # Default uses the vulkan variant: software rendering via mesa's llvmpipe ICD + # when no GPU hardware is present, real driver ICD (NVIDIA/Intel/AMD) when one + # is. Joel's 2026-04-23 architectural rule: "lack of GPU integration is + # forbidden". 
The previous CPU-only 'core' variant violated that by panicking + # on no-GPU per gpu/memory_manager.rs:757. Vulkan-with-llvmpipe satisfies the + # rule (binary exercises the GPU API loader; llvmpipe answers the queries via + # software rasterizer). Removed in #1038 (Task #98) — see + # docs/INSTALL-ARCHITECTURE.md. + # + # CUDA hosts overlay docker-compose.gpu.yml to swap in continuum-core-cuda for + # NVIDIA-accelerated inference. Mac runs continuum-core natively (overlay + # docker-compose.mac.yml sets replicas:0 here). continuum-core: build: context: ./src/workers - dockerfile: ../../docker/continuum-core.Dockerfile + dockerfile: ../../docker/continuum-core-vulkan.Dockerfile additional_contexts: avatars: ./src/models/avatars shared-generated: ./src/shared/generated args: # --no-default-features excludes livekit-webrtc (handled by livekit-bridge). # load-dynamic-ort loads ONNX Runtime as shared lib (runtime discovery). - GPU_FEATURES: "--no-default-features --features load-dynamic-ort" - image: ghcr.io/cambriantech/continuum-core:${CONTINUUM_IMAGE_TAG:-latest} + # vulkan feature wires through to llama.cpp's GGML_VULKAN backend. + GPU_FEATURES: "--no-default-features --features load-dynamic-ort,vulkan" + image: ghcr.io/cambriantech/continuum-core-vulkan:${CONTINUUM_IMAGE_TAG:-latest} restart: unless-stopped # Sized for mission: Qwen 4-8B Q4 + KV cache for 5 personas + embeddings # + Bevy render + vision + audio. 
Auto-calculated by install.sh from host @@ -199,7 +212,8 @@ services: restart: unless-stopped mem_limit: 512m depends_on: - - node-server + node-server: + condition: service_healthy ports: - "9003:9003" # HTTP volumes: diff --git a/docker/model-init.Dockerfile b/docker/model-init.Dockerfile index 345a690fa..0586fce23 100644 --- a/docker/model-init.Dockerfile +++ b/docker/model-init.Dockerfile @@ -12,24 +12,30 @@ FROM node:20-slim LABEL org.opencontainers.image.source=https://github.com/CambrianTech/continuum RUN apt-get update && apt-get install -y --no-install-recommends \ - curl unzip bash ca-certificates \ + curl unzip bash ca-certificates jq \ && rm -rf /var/lib/apt/lists/* WORKDIR /app -# Copy download scripts and their shared dependencies -COPY scripts/download-voice-models.sh scripts/download-voice-models.sh +# Single source of truth for ALL models the system uses (chat / vision / +# embedding / STT / TTS / VAD). Per Joel 2026-05-04: +# "we MUST have this work from ONE source of truth" +COPY shared/models.json shared/models.json +COPY scripts/download-models.sh scripts/download-models.sh +# Avatar download (VRM files) — distinct from ML models, kept separate for now. 
COPY scripts/download-avatar-models.sh scripts/download-avatar-models.sh COPY scripts/generate-scene-models.ts scripts/generate-scene-models.ts COPY scripts/shared/ scripts/shared/ COPY package.json package.json -RUN chmod +x scripts/download-voice-models.sh scripts/download-avatar-models.sh +RUN chmod +x scripts/download-models.sh scripts/download-avatar-models.sh -# MODELS_DIR is set by docker-compose.yml to /models (the volume mount) ENV MODELS_DIR=/models - -# Download voice models (whisper, piper, kokoro, orpheus, vad) -# then avatar models (VRM files) -# Scene generation requires tsx — skip in init, handled by npm start -CMD bash scripts/download-voice-models.sh && bash scripts/download-avatar-models.sh +ENV REGISTRY=/app/shared/models.json + +# Download all models from src/shared/models.json (chat-LLM tier-default, +# embeddings, STT, TTS, VAD) then avatar models. Per Joel 2026-05-04: +# "all the models must download and run on GPU" — no DMR dependency. +# continuum-core loads chat LLMs via its built-in llama.cpp + host GPU +# (Metal / CUDA / Vulkan ICD). 
+CMD bash scripts/download-models.sh && bash scripts/download-avatar-models.sh diff --git a/docker/node-server.Dockerfile b/docker/node-server.Dockerfile index e780203a4..a4e98a30b 100644 --- a/docker/node-server.Dockerfile +++ b/docker/node-server.Dockerfile @@ -27,6 +27,6 @@ VOLUME ["/root/.continuum"] EXPOSE 9000 9001 HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=3 \ - CMD node -e "const s=require('net').connect(9001,'localhost',()=>{s.end();process.exit(0)});s.on('error',()=>process.exit(1))" + CMD test -f /root/.continuum/run/node-server.ready && node -e "const s=require('net').connect(9001,'localhost',()=>{s.end();process.exit(0)});s.on('error',()=>process.exit(1))" CMD ["npx", "tsx", "server/docker-entrypoint.ts"] diff --git a/install.sh b/install.sh index 31fd7a0d2..4e1e3199d 100644 --- a/install.sh +++ b/install.sh @@ -425,12 +425,14 @@ EOF esac case "$IC_GPU_PATH" in dmr-*) - if ! docker model ls 2>/dev/null | grep -q "qwen3.5-4b-code-forged"; then - info "Pulling default persona model into Docker Model Runner (~2.7GB, first install only)..." - docker model pull "$PERSONA_MODEL" || warn "Model pull failed — chat will error until model is available. Retry: docker model pull $PERSONA_MODEL" - else - ok "Persona model already in DMR: $PERSONA_MODEL" - fi + # Per Joel 2026-05-04: "all the models must download and run on GPU" + # + "we MUST have this work from ONE source of truth". DMR's + # `docker model pull` was the Mac-only path that didn't work on + # Linux. Models now download via the model-init container reading + # src/shared/models.json — same path on Mac/Linux/Windows. The DMR + # branch here remains for KV-cache-config + vLLM-MLX install (which + # are still useful tuning), but no longer pulls the model. + ok "Persona model download deferred to model-init container (reads src/shared/models.json)" # Cap llama-server's per-slot KV cache reservation, sized to actual # physical RAM. 
Without this cap each slot reserves the full model # context (262144 tokens for Qwen3.5), ballooning @@ -483,11 +485,10 @@ EOF # Pull MLX-format Qwen3.5-4B for vllm-metal routing. # DMR auto-routes MLX models to vllm-metal when installed. MLX_MODEL="hf.co/mlx-community/Qwen3.5-4B-MLX-4bit" - if ! docker model ls 2>/dev/null | grep -q "Qwen3.5-4B-MLX"; then - info "Pulling MLX-format Qwen3.5-4B (~2.5GB, for 3x faster inference)..." - docker model pull "$MLX_MODEL" \ - || warn "MLX model pull failed. GGUF via llama.cpp will be used instead." - fi + # MLX-format model also moves to registry-driven download. + # Add MLX entry to src/shared/models.json + auto_download.always + # if/when we want vllm-metal to find it on disk. + ok "MLX model download deferred to model-init (add to src/shared/models.json to enable)" else warn "vLLM install failed (requires Docker Desktop 4.62+). llama.cpp Metal will be used." fi @@ -887,10 +888,25 @@ elif [[ "$HAS_GPU" == "true" ]]; then if [ -f "docker-compose.gpu.yml" ]; then COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu.yml" else - warn "docker-compose.gpu.yml missing — GPU detected but cuda override won't apply. Continuing on CPU images." + warn "docker-compose.gpu.yml missing — GPU detected but cuda override won't apply. Continuing on Vulkan base image (still GPU-API; will use llvmpipe ICD if no vulkan driver)." fi COMPOSE_ARGS="--profile gpu" fi +# Linux without a CUDA GPU: base docker-compose.yml uses continuum-core-vulkan. +# On real-driver hosts (Intel/AMD with vulkan) this picks up the hardware ICD; +# on hosts without a driver, mesa-vulkan-drivers (apt) provides llvmpipe as a +# software ICD so the Vulkan code path runs without panicking. Joel's +# 2026-04-23 rule: "lack of GPU integration is forbidden". Vulkan-via- +# llvmpipe is GPU integration (loader + ICD), not a CPU fallback. +if [[ "$OS" == "Linux" ]] && [[ "$HAS_GPU" != "true" ]]; then + if ! 
command -v vulkaninfo >/dev/null 2>&1; then + warn "vulkaninfo not found — install mesa-vulkan-drivers vulkan-tools so the Vulkan loader has the llvmpipe software ICD: sudo apt-get install -y mesa-vulkan-drivers vulkan-tools" + elif ! vulkaninfo --summary 2>/dev/null | grep -qE "deviceName"; then + warn "Vulkan loader present but enumerated zero devices. continuum-core-vulkan will panic on startup. Install: sudo apt-get install -y mesa-vulkan-drivers" + else + info "Vulkan loader OK — will use $(vulkaninfo --summary 2>/dev/null | grep -E 'deviceName' | head -1 | sed 's/.*= *//')" + fi +fi # ── 7. Pull support-service images ───────────────────────── PHASE="pull images" @@ -1044,6 +1060,38 @@ for i in $(seq 1 "$HEALTH_TIMEOUT_SEC"); do sleep 1 done +# ── 8c. Wait for node-server seed to populate the default room ────── +# widget-server /health on port 9003 only proves that container is up. +# node-server (port 9001) runs auto-seed in docker-entrypoint.ts which +# creates the "general" room + personas. If the user opens the page or +# chat probe runs BEFORE seed completes, chat/send returns "Room not +# found: general" or "User not found" silently. Probe directly for the +# general room via jtag — fast, no new endpoint needed, deterministic. +# Caught by carl-install-smoke 2026-05-04 (PR #1038). +SEED_TIMEOUT_SEC="${SEED_TIMEOUT_SEC:-60}" +JTAG_BIN="$(command -v jtag 2>/dev/null || true)" +[ -z "$JTAG_BIN" ] && JTAG_BIN="$INSTALL_DIR/src/jtag" +if [ -x "$JTAG_BIN" ] && [ "$HEALTH_OK" -eq 1 ]; then + info "Waiting for seed to populate default room (timeout ${SEED_TIMEOUT_SEC}s)..." + SEED_OK=0 + for i in $(seq 1 "$SEED_TIMEOUT_SEC"); do + # data/list returns success+items when the room exists. Empty items + # means seed hasn't created it yet. 
+ if "$JTAG_BIN" data/list --collection=rooms --filter='{"uniqueId":"general"}' --limit=1 2>/dev/null \ + | grep -q '"success":true.*"items":\[{'; then + SEED_OK=1 + ok "default room seeded after ${i}s" + break + fi + sleep 1 + done + if [ "$SEED_OK" -ne 1 ]; then + warn "general room not present after ${SEED_TIMEOUT_SEC}s — seed may have failed." + warn " Chat will return 'Room not found' until seed completes." + warn " Diagnose: $CONTAINER_CMD compose -f $INSTALL_DIR/docker-compose.yml logs node-server | tail -50" + fi +fi + # ── 9. Determine URL + open browser (only if healthy) ────── PHASE="open browser" if [ -n "$TS_HOSTNAME" ] && [ -f "$CONTINUUM_DATA/$TS_HOSTNAME.crt" ]; then diff --git a/scripts/ci/carl-install-smoke.sh b/scripts/ci/carl-install-smoke.sh index 7003ba72e..8a59d1074 100644 --- a/scripts/ci/carl-install-smoke.sh +++ b/scripts/ci/carl-install-smoke.sh @@ -48,6 +48,19 @@ echo "━━━━━━━━━━━━━━━━━━━━━━━━ teardown() { local rc=$? + # Capture per-container docker logs BEFORE `docker compose down` kills + # the containers and makes their logs unrecoverable. Without this the + # workflow's `if: failure()` step fires after smoke exit when containers + # are already gone — exactly the silent-evidence-loss the per-container + # logs are supposed to prevent. Capture on every exit (success or + # failure) since the file glob in the workflow upload is failure-only. 
+ if [ -d "$CARL_INSTALL_DIR" ] && [ -f "$CARL_INSTALL_DIR/docker-compose.yml" ]; then + for svc in continuum-core node-server model-init widget-server livekit-bridge; do + ( cd "$CARL_INSTALL_DIR" && docker compose logs --no-color --timestamps "$svc" \ + > "${CARL_INSTALL_DIR}.${svc}.log" 2>&1 ) || true + done + ( cd "$CARL_INSTALL_DIR" && docker compose ps -a > "${CARL_INSTALL_DIR}.compose-ps.log" 2>&1 ) || true + fi if [ "$SKIP_TEARDOWN" != "1" ] && [ -d "$CARL_INSTALL_DIR" ]; then echo "" echo "━━━ tearing down $CARL_INSTALL_DIR ━━━" @@ -167,6 +180,33 @@ done echo "✅ root page looks like real HTML (${ROOT_BYTES} bytes, no failure markers)" +# ── 3b. Headless screenshot — what Carl ACTUALLY sees in the browser ── +# curl gives the server-rendered HTML shell. The chat UI itself loads via +# JS — could be a blank chat with no personas or an empty room and curl +# wouldn't catch it. Use chromium headless to capture what a real browser +# renders. Wait a few seconds for the JS to populate tabs, personas, +# rooms before snapping. Continue on screenshot failure (chrome may not +# be on the PATH for non-CI runs); this is diagnostic, not gating. +PAGE_PNG="${CARL_INSTALL_DIR}.page.png" +CHROME_BIN="$(command -v google-chrome || command -v chromium || command -v chromium-browser || true)" +if [ -n "$CHROME_BIN" ]; then + echo "" + echo "━━━ headless screenshot via $CHROME_BIN (waits 8s for JS to render) ━━━" + sleep 8 + "$CHROME_BIN" --headless --disable-gpu --no-sandbox --hide-scrollbars \ + --window-size=1280,1024 \ + --screenshot="$PAGE_PNG" \ + --virtual-time-budget=8000 \ + "http://localhost:9003/" >/dev/null 2>&1 || true + if [ -f "$PAGE_PNG" ]; then + echo " ✓ screenshot saved: $PAGE_PNG ($(stat -c%s "$PAGE_PNG" 2>/dev/null || stat -f%z "$PAGE_PNG") bytes)" + else + echo " ⚠ screenshot capture failed (non-fatal)" + fi +else + echo " ⚠ no chromium/chrome on PATH — skipping browser screenshot" +fi + # ── 4. 
End-to-end chat: Carl types a message, expects an AI reply ───── # Per Joel's "OOTB on MacBook Air, free, accessible" + "canary e2e # working from curl, Carl's case" — page-render is necessary but not diff --git a/scripts/test-slices.sh b/scripts/test-slices.sh index 8ee928e5d..9be1ce234 100755 --- a/scripts/test-slices.sh +++ b/scripts/test-slices.sh @@ -219,6 +219,54 @@ else else fail "vulkan-runtime-linked" "continuum-core-server does not link libvulkan — feature flag didn't propagate?" fi + # Slice 3: continuum-core RUNTIME actually USED Vulkan (not just linked + # it). On boot, GpuMemoryManager logs "GPU detected: <device> — <N>MB VRAM" + # via log_info!("gpu", "manager", ...). If we don't see that line, the + # binary either skipped GPU detection (feature flag broken) or panicked + # silently before the log fired. Either way, image isn't shippable. + # 30s window covers normal boot + GpuMemoryManager init. + VK_BOOT_SEEN=false + for _ in $(seq 1 30); do + if docker logs "$CID" 2>&1 | grep -qE "GPU detected: .* — [0-9]+MB VRAM"; then + VK_BOOT_SEEN=true + break + fi + sleep 1 + done + if $VK_BOOT_SEEN; then + VK_DEV=$(docker logs "$CID" 2>&1 | grep -oE "GPU detected: [^—]+ — [0-9]+MB VRAM" | head -1) + pass "vulkan-runtime-used-by-core ($VK_DEV)" + else + fail "vulkan-runtime-used-by-core" "continuum-core never logged GPU detection within 30s — binary linked libvulkan but didn't enumerate devices through it" + echo " recent core logs:" >&2 + docker logs --tail 20 "$CID" 2>&1 | sed 's/^/ /' >&2 + fi + # Slice 4: continuum-core IPC reports the GPU it actually picked. + # gpu/stats returns the manager's view: total_vram_mb + per-subsystem + # budgets. If totals are 0 or the call errors, the runtime contract is + # broken even though boot logged a device. Probe via netcat over the + # bind-mounted unix socket — minimal IPC handshake, no python/node deps. 
+ GPU_STATS=$(docker exec "$CID" sh -c ' + SOCK=/root/.continuum/sockets/continuum-core.sock + [ -S "$SOCK" ] || exit 1 + printf "%s" "{\"command\":\"gpu/stats\",\"params\":null}" | nc -U -w 5 "$SOCK" 2>/dev/null + ' 2>&1 || true) + if echo "$GPU_STATS" | grep -qE '"total_vram_mb"\s*:\s*[1-9]'; then + VRAM=$(echo "$GPU_STATS" | grep -oE '"total_vram_mb"\s*:\s*[0-9]+' | grep -oE '[0-9]+$') + pass "vulkan-ipc-reports-gpu (${VRAM}MB)" + elif echo "$GPU_STATS" | grep -q '"total_vram_mb"'; then + fail "vulkan-ipc-reports-gpu" "gpu/stats returned 0 total_vram_mb — manager initialized but didn't claim memory" + else + # nc may not be in the runtime image — skip with a note rather than + # fail, since slice 3 above already proves runtime use via boot logs. + # Image rebuild can add netcat to bring this probe online. + if ! docker exec "$CID" which nc >/dev/null 2>&1; then + echo " - vulkan-ipc-reports-gpu skipped: nc not in runtime image (boot-log slice covers runtime-use)" >&2 + else + fail "vulkan-ipc-reports-gpu" "gpu/stats IPC didn't return expected shape" + echo " raw response: $(echo "$GPU_STATS" | head -5)" >&2 + fi + fi ;; core) # CPU-only variant — just sanity that OpenMP runtime is present diff --git a/src/daemons/ai-provider-daemon/adapters/candle/shared/CandleAdapter.ts b/src/daemons/ai-provider-daemon/adapters/candle/shared/CandleAdapter.ts index 22d2d8a35..6e30cc976 100644 --- a/src/daemons/ai-provider-daemon/adapters/candle/shared/CandleAdapter.ts +++ b/src/daemons/ai-provider-daemon/adapters/candle/shared/CandleAdapter.ts @@ -25,8 +25,14 @@ import type { } from '../../../shared/AIProviderTypesV2'; import { InferenceGrpcClient } from '../../../../../system/core/services/InferenceGrpcClient'; import { LOCAL_MODELS } from '../../../../../system/shared/Constants'; +import { + resolveModel as registryResolveModel, + tierFromRamGB, + type Tier, +} from '../../../../../shared/ModelRegistry'; import { existsSync } from 'fs'; import { resolve } from 'path'; +import 
{ totalmem } from 'os'; // ============================================================================ // Types @@ -83,6 +89,7 @@ export class CandleAdapter extends BaseAIProviderAdapter { private loadedModels: Set = new Set(); private loadedAdapters: Map = new Map(); // modelId -> adapters private maxInputTokens: number; + private hostTier: Tier; constructor(config: CandleAdapterConfig = {}) { super(); @@ -90,6 +97,11 @@ export class CandleAdapter extends BaseAIProviderAdapter { // Use gRPC client (replaces Unix socket) this.client = InferenceGrpcClient.sharedInstance(); + // Tier is fixed at process start — RAM doesn't change, and resolving + // the same symbolic ref to different models mid-process would defeat + // the gRPC server's preload contract. + this.hostTier = tierFromRamGB(Math.round(totalmem() / 1024 / 1024 / 1024)); + this.defaultModel = config.defaultModel || LOCAL_MODELS.DEFAULT; this.baseTimeout = config.timeout || 180000; // 180s to handle model download + generation // Q8_0 quantized model can handle ~1500 tokens input reliably @@ -100,6 +112,32 @@ export class CandleAdapter extends BaseAIProviderAdapter { // Note: Model is pre-loaded by gRPC server at startup } + /** + * Resolve a model identifier to a concrete HuggingFace ID. + * + * Handles three input shapes (in order): + * 1. Symbolic ref ('local-default', 'vision-default', 'gating') → + * ModelRegistry resolves via src/shared/models.json (current registry). + * 2. Registry key ('qwen3.5-4b-code-forged', 'qwen2-vl-7b') → + * ModelRegistry returns concrete hf_repo. + * 3. Legacy short name ('llama3.2:3b') OR raw HF ID → + * LOCAL_MODELS.mapToHuggingFace fallback. + * + * This is the boundary that lets persona DB rows store stable symbolic + * refs while every request still resolves to whatever the registry + * declares "current" — no DB migration when we swap underlying models. 
+ */ + private resolveModelId(requestedModel: string): string { + try { + const spec = registryResolveModel(requestedModel, this.hostTier); + return spec.hf_repo; + } catch { + // Not in registry — fall through to legacy mapping (which assumes + // raw HF ID if no match). + return LOCAL_MODELS.mapToHuggingFace(requestedModel); + } + } + // Note: Model is pre-loaded by gRPC server at startup, not by TypeScript // ============================================================================ @@ -114,13 +152,18 @@ export class CandleAdapter extends BaseAIProviderAdapter { this.log(request, 'info', `🔧 TRACE-1: generateTextImpl START (requestId=${requestId.slice(0,8)})`); - // Determine model to use - map legacy names to HuggingFace via central config + // Determine model to use. Accepts symbolic refs ('local-default', + // 'vision-default', 'gating'), registry keys ('qwen3.5-4b-code-forged'), + // legacy short names ('llama3.2:3b'), or raw HF IDs. ModelRegistry is + // the source of truth — DB rows storing symbolic refs auto-pick-up + // registry edits without migration. Joel rule 2026-05-04: + // "we MUST have this work from ONE source of truth". 
const requestedModel = request.model || this.defaultModel; - const modelId = LOCAL_MODELS.mapToHuggingFace(requestedModel); + const modelId = this.resolveModelId(requestedModel); // Log mapping if different if (modelId !== requestedModel) { - this.log(request, 'info', `Model mapped: ${requestedModel} → ${modelId}`); + this.log(request, 'info', `Model resolved: ${requestedModel} → ${modelId} (tier=${this.hostTier})`); } // Model is pre-loaded by gRPC server at startup @@ -344,7 +387,7 @@ export class CandleAdapter extends BaseAIProviderAdapter { adapterName: string; applyImmediately?: boolean; }): Promise { - const modelId = LOCAL_MODELS.mapToHuggingFace(skillImplementation.modelId); + const modelId = this.resolveModelId(skillImplementation.modelId); const { adapterName, adapterPath } = skillImplementation; this.log(null, 'info', `🧬 applySkill: Loading adapter "${adapterName}" from ${adapterPath}`); @@ -592,7 +635,7 @@ export class CandleAdapter extends BaseAIProviderAdapter { * STUBBED: gRPC server preloads model at startup */ async preloadModel(requestedModelId: string): Promise { - const modelId = LOCAL_MODELS.mapToHuggingFace(requestedModelId); + const modelId = this.resolveModelId(requestedModelId); this.log(null, 'info', `preloadModel: Model ${modelId} is preloaded by gRPC server`); this.loadedModels.add(modelId); } diff --git a/src/scripts/build-with-loud-failure.ts b/src/scripts/build-with-loud-failure.ts index 20a375bb4..e12a8893d 100644 --- a/src/scripts/build-with-loud-failure.ts +++ b/src/scripts/build-with-loud-failure.ts @@ -6,6 +6,8 @@ */ import { execSync } from 'child_process'; +import { copyFileSync, mkdirSync, existsSync } from 'fs'; +import { dirname } from 'path'; console.log('🔨 Building TypeScript with strict error checking...\n'); @@ -16,6 +18,19 @@ try { encoding: 'utf-8' }); + // Copy non-TS runtime assets that ModelRegistry / scripts read by path. 
+ // tsc doesn't copy JSON — anything that ships next to .ts and is read + // at runtime via __dirname must be replicated into dist/. + const assets: Array<[string, string]> = [ + ['shared/models.json', 'dist/shared/models.json'], + ]; + for (const [src, dest] of assets) { + if (!existsSync(src)) continue; // Optional asset — skip if absent. + mkdirSync(dirname(dest), { recursive: true }); + copyFileSync(src, dest); + console.log(`📦 Copied asset: ${src} → ${dest}`); + } + console.log('\n✅ TypeScript compilation succeeded'); process.exit(0); diff --git a/src/scripts/download-models.sh b/src/scripts/download-models.sh new file mode 100755 index 000000000..53d343dba --- /dev/null +++ b/src/scripts/download-models.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# download-models.sh — Reads src/shared/models.json and downloads every +# model listed in `auto_download.always` plus the tier-specific set. Runs +# in the model-init container. +# +# Replaces the previous Mac-only `docker model pull` flow + the hardcoded +# URL list in download-voice-models.sh. ONE source of truth (models.json) +# means swapping a model is a single edit there — this script and all +# other consumers pick it up automatically. +# +# Per Joel's rule (2026-05-04): "all the models must download and run on +# GPU" — no DMR dependency. Continuum-core loads everything via its +# built-in llama.cpp via the host GPU (Metal / CUDA / Vulkan ICD). +# +# Env: +# MODELS_DIR=/models (the volume mount; default /models) +# TIER=full (mba | mid | full; defaults to full if RAM ≥ 32GB) +# REGISTRY=/app/shared/models.json (path to registry inside container) + +set -euo pipefail + +MODELS_DIR="${MODELS_DIR:-/models}" +REGISTRY="${REGISTRY:-/app/shared/models.json}" + +# Auto-detect tier from total RAM if not set. Mirrors install.sh tier +# logic + ModelRegistry.tierFromRamGB() — keep consistent. 
+if [[ -z "${TIER:-}" ]]; then  # TIER not supplied by the caller → derive it from total RAM
+  if [[ -f /proc/meminfo ]]; then
+    RAM_KB=$(awk '/^MemTotal:/ {print $2; exit}' /proc/meminfo)  # value is in kB; empty if the field is absent
+    RAM_GB=$(( (${RAM_KB:-33554432} + 524288) / 1048576 ))  # round to nearest GiB (CandleAdapter uses Math.round, not floor); no MemTotal → 32
+  else
+    RAM_GB=32 # no /proc/meminfo (non-Linux host): assume full tier
+  fi
+  if [[ "$RAM_GB" -ge 32 ]]; then TIER=full
+  elif [[ "$RAM_GB" -ge 24 ]]; then TIER=mid
+  else TIER=mba
+  fi
+fi
+
+YELLOW='\033[1;33m'  # ANSI colour escapes, rendered by `echo -e` below
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+NC='\033[0m'  # reset
+
+mkdir -p "$MODELS_DIR"
+
+echo -e "${YELLOW}━━━ download-models.sh — registry-driven model download ━━━${NC}"
+echo " REGISTRY: $REGISTRY"
+echo " MODELS_DIR: $MODELS_DIR"
+echo " TIER: $TIER"
+echo ""
+
+if [[ ! -f "$REGISTRY" ]]; then  # preflight: nothing can be resolved without the registry
+  echo -e "${RED}ERROR: registry file $REGISTRY not found in container.${NC}" >&2
+  echo " Check model-init.Dockerfile COPY of src/shared/models.json." >&2
+  exit 1
+fi
+
+if ! command -v jq >/dev/null 2>&1; then  # preflight: every registry lookup below goes through jq
+  echo -e "${RED}ERROR: jq not installed in this image.${NC}" >&2
+  echo " Add 'jq' to the apt-get line in model-init.Dockerfile." >&2
+  exit 1
+fi
+
+# Download set = auto_download.always[] + auto_download.by_tier[$TIER][] (absent tier → none); jq `unique` de-duplicates and sorts.
+mapfile -t MODEL_KEYS < <(jq -r --arg tier "$TIER" '
+  [
+    .auto_download.always[],
+    (.auto_download.by_tier[$tier] // [])[]
+  ] | unique | .[]
+' "$REGISTRY")
+
+echo -e "${YELLOW}Models to download (${#MODEL_KEYS[@]}): ${MODEL_KEYS[*]}${NC}"
+echo ""
+
+# Download via huggingface direct-URL pattern: each model has files[].
+# We resolve to https://huggingface.co/<hf_repo>/resolve/main/<file> and curl.
+# The huggingface-cli would be cleaner but adds Python+pip to model-init
+# (currently a tiny node:slim image, ~120MB). Direct curl keeps it lean.
+for KEY in "${MODEL_KEYS[@]}"; do + KIND=$(jq -r --arg k "$KEY" '.models[$k].kind // "unknown"' "$REGISTRY") + REPO=$(jq -r --arg k "$KEY" '.models[$k].hf_repo // ""' "$REGISTRY") + FORMAT=$(jq -r --arg k "$KEY" '.models[$k].format // ""' "$REGISTRY") + SIZE=$(jq -r --arg k "$KEY" '.models[$k].size_gb // "?"' "$REGISTRY") + + if [[ -z "$REPO" ]]; then + echo -e "${YELLOW} SKIP $KEY — no hf_repo in registry${NC}" + continue + fi + # Skip candle-builtin formats (continuum-core loads from rust-bert / candle direct) + if [[ "$FORMAT" == "candle-builtin" ]]; then + echo -e "${GREEN} SKIP $KEY — format=candle-builtin (loaded in-process by continuum-core)${NC}" + continue + fi + + TARGET_DIR="$MODELS_DIR/$KEY" + mkdir -p "$TARGET_DIR" + + # Get files list. Some entries omit files (huggingface-cli style); skip those. + mapfile -t FILES < <(jq -r --arg k "$KEY" '.models[$k].files // [] | .[]' "$REGISTRY") + if [[ ${#FILES[@]} -eq 0 ]]; then + echo -e "${YELLOW} SKIP $KEY — no files[] specified (huggingface-cli pull required)${NC}" + continue + fi + + echo -e "${YELLOW}━━ $KEY (kind=$KIND, ~${SIZE}GB) ━━${NC}" + for FILE in "${FILES[@]}"; do + DEST="$TARGET_DIR/$(basename "$FILE")" + if [[ -f "$DEST" ]]; then + echo -e "${GREEN} ✓ already cached: $(basename "$FILE")${NC}" + continue + fi + URL="https://huggingface.co/${REPO}/resolve/main/${FILE}" + echo " ↓ $URL" + if curl -fsSL --retry 3 --retry-delay 2 -o "$DEST.partial" "$URL"; then + mv "$DEST.partial" "$DEST" + echo -e "${GREEN} ✓ $(basename "$FILE") ($(du -h "$DEST" | cut -f1))${NC}" + else + rm -f "$DEST.partial" + echo -e "${RED} ✗ FAILED to download $FILE${NC}" >&2 + # Continue rather than fail-the-container — partial models is better + # than no models. continuum-core will report missing-file at load time. 
+ fi + done +done + +echo "" +echo -e "${GREEN}━━ download-models.sh complete (TIER=$TIER) ━━${NC}" +echo " Total in $MODELS_DIR: $(du -sh "$MODELS_DIR" 2>/dev/null | cut -f1)" diff --git a/src/scripts/seed/personas.ts b/src/scripts/seed/personas.ts index f9a28a49c..f0dcd047a 100644 --- a/src/scripts/seed/personas.ts +++ b/src/scripts/seed/personas.ts @@ -16,6 +16,7 @@ import { generateUniqueId } from '../../system/data/utils/UniqueIdUtils'; import { LOCAL_MODELS } from '../../system/shared/Constants'; +import { SYMBOLIC_REFS } from '../../shared/ModelRegistry'; import { execSync } from 'child_process'; export interface PersonaConfig { @@ -24,7 +25,15 @@ export interface PersonaConfig { provider?: string; type: 'agent' | 'persona'; voiceId?: string; // TTS speaker ID (0-246 for LibriTTS multi-speaker model) - modelId?: string; // AI model ID (e.g., 'qwen3-omni-flash-realtime' for audio-native) + modelId?: string; // Concrete AI model ID — LEGACY/cached. Prefer modelRef. + modelRef?: string; // Symbolic ref into src/shared/models.json + // ('local-default', 'vision-default', 'gating'). Resolved + // at request time by ModelRegistry → current registry + // value picks up automatically when models.json changes. + // Per Joel 2026-05-04: "update the existing seeded values + // so the personas PICK UP THE MODEL change and arent + // stuck in the past." Symbolic refs eliminate stale-DB + // drift entirely. isAudioNative?: boolean; // True if model supports direct audio I/O (no STT/TTS needed) apiKeyEnv?: string; // Environment variable name for the API key (e.g., 'ANTHROPIC_API_KEY') minVramGB?: number; // Minimum VRAM in GB for local inference (candle provider) @@ -56,9 +65,9 @@ export const PERSONA_CONFIGS: PersonaConfig[] = [ // error if neither is available. Never silent Candle-CPU fallback. // 4B GGUF is the universal default — fits every supported machine, fast // on Metal/Vulkan/CUDA. Power users upgrade to 27B manually (HF-gated). 
- { uniqueId: generateUniqueId('Helper'), displayName: 'Helper AI', provider: 'local', type: 'persona', voiceId: '50', minVramGB: 3, modelId: LOCAL_MODELS.DEFAULT }, - { uniqueId: generateUniqueId('Teacher'), displayName: 'Teacher AI', provider: 'local', type: 'persona', voiceId: '75', minVramGB: 5, modelId: LOCAL_MODELS.DEFAULT }, - { uniqueId: generateUniqueId('CodeReview'), displayName: 'CodeReview AI', provider: 'local', type: 'persona', voiceId: '100', minVramGB: 5, modelId: LOCAL_MODELS.DEFAULT }, + { uniqueId: generateUniqueId('Helper'), displayName: 'Helper AI', provider: 'local', type: 'persona', voiceId: '50', minVramGB: 3, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, + { uniqueId: generateUniqueId('Teacher'), displayName: 'Teacher AI', provider: 'local', type: 'persona', voiceId: '75', minVramGB: 5, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, + { uniqueId: generateUniqueId('CodeReview'), displayName: 'CodeReview AI', provider: 'local', type: 'persona', voiceId: '100', minVramGB: 5, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, // Cloud provider personas (each needs its own API key) { uniqueId: generateUniqueId('DeepSeek'), displayName: 'DeepSeek Assistant', provider: 'deepseek', type: 'persona', voiceId: '125', apiKeyEnv: 'DEEPSEEK_API_KEY' }, @@ -68,7 +77,7 @@ export const PERSONA_CONFIGS: PersonaConfig[] = [ { uniqueId: generateUniqueId('Grok'), displayName: 'Grok', provider: 'xai', type: 'persona', voiceId: '220', apiKeyEnv: 'XAI_API_KEY' }, { uniqueId: generateUniqueId('Together'), displayName: 'Together Assistant', provider: 'together', type: 'persona', voiceId: '30', apiKeyEnv: 'TOGETHER_API_KEY' }, { uniqueId: generateUniqueId('Fireworks'), displayName: 'Fireworks AI', provider: 'fireworks', type: 'persona', voiceId: '60', apiKeyEnv: 'FIREWORKS_API_KEY' }, - { uniqueId: generateUniqueId('Local'), displayName: 'Local Assistant', provider: 'local', type: 'persona', voiceId: '90', minVramGB: 4, modelId: LOCAL_MODELS.DEFAULT }, + { uniqueId: 
generateUniqueId('Local'), displayName: 'Local Assistant', provider: 'local', type: 'persona', voiceId: '90', minVramGB: 4, modelRef: SYMBOLIC_REFS.LOCAL_DEFAULT }, { uniqueId: generateUniqueId('Sentinel'), displayName: 'Sentinel', provider: 'sentinel', type: 'persona', voiceId: '240' }, { uniqueId: generateUniqueId('Gemini'), displayName: 'Gemini', provider: 'google', type: 'persona', voiceId: '115', apiKeyEnv: 'GOOGLE_API_KEY' }, @@ -91,7 +100,7 @@ export const PERSONA_CONFIGS: PersonaConfig[] = [ type: 'persona', voiceId: '105', minVramGB: 5, - modelId: LOCAL_MODELS.VISION, + modelRef: SYMBOLIC_REFS.VISION_DEFAULT, }, // Audio AI persona is intentionally NOT seeded yet. The Qwen2-Audio-7B diff --git a/src/server/docker-entrypoint.ts b/src/server/docker-entrypoint.ts index 31ad70b1f..eab9ac40c 100644 --- a/src/server/docker-entrypoint.ts +++ b/src/server/docker-entrypoint.ts @@ -10,12 +10,17 @@ import { systemOrchestrator } from '../system/orchestration/SystemOrchestrator'; import { getActiveExampleName } from '../examples/server/ExampleConfigServer'; +import { mkdir, rm, writeFile } from 'fs/promises'; +import { dirname } from 'path'; + +const READINESS_FILE = process.env.CONTINUUM_NODE_READY_FILE || '/root/.continuum/run/node-server.ready'; async function main(): Promise { const activeExample = getActiveExampleName(); const workingDir = `examples/${activeExample}`; console.log(`🐳 Docker node-server starting (example: ${activeExample})`); + await rm(READINESS_FILE, { force: true }); const result = await systemOrchestrator.orchestrate('cli-command', { workingDir, @@ -29,12 +34,14 @@ async function main(): Promise { process.exit(1); } - console.log(`✅ Server ready (milestones: ${result.completedMilestones.join(' → ')})`); + await mkdir(dirname(READINESS_FILE), { recursive: true }); + await writeFile(READINESS_FILE, `${new Date().toISOString()}\n`, 'utf8'); // Seed runs synchronously inside SystemOrchestrator before SERVER_READY // milestone fires (see 
SystemOrchestrator.ts). No duplicate seed here — // the previous setTimeout(5000) raced the orchestrator's setTimeout(3000) // and could re-enter findOrCreateRoom on a partially-committed table. + console.log(`✅ Server ready (milestones: ${result.completedMilestones.join(' → ')})`); // Keep process alive — server event loop runs in background } diff --git a/src/server/seed-in-process.ts b/src/server/seed-in-process.ts index 456c88f90..6dfdaba9d 100644 --- a/src/server/seed-in-process.ts +++ b/src/server/seed-in-process.ts @@ -295,15 +295,31 @@ async function syncPersonaProviders(_seeder: DatabaseSeeder): Promise { // Vision AI on docker carl ended up running a code model with no // vision capability — see #957. Pass config.modelId through so the // persona seed's declared model survives every resync. + // + // 2026-05-04: PersonaConfig now prefers symbolic modelRef (e.g. + // 'local-default', 'vision-default') over hardcoded modelId. This + // resolves to the CURRENT registry value at seed time so changing + // src/shared/models.json automatically updates seeded personas + // ("update the existing seeded values so the personas PICK UP THE + // MODEL change and arent stuck in the past" — Joel 2026-05-04). + // The reconciler check below + this resolve will UPDATE existing + // rows when the registry changes. const currentModelId = (user as Record).modelConfig ? 
((user as Record).modelConfig as Record).model : undefined; - const desiredModelId = config.modelId; + let desiredModelId = config.modelId; + if (!desiredModelId && config.modelRef) { + const { resolveModel, tierFromRamGB } = await import('../shared/ModelRegistry'); + const ramGB = Math.round((require('os').totalmem() / 1024 / 1024 / 1024)); + const tier = tierFromRamGB(ramGB); + const spec = resolveModel(config.modelRef, tier); + desiredModelId = spec.hf_repo; + } const providerChanged = currentProvider !== config.provider; const modelChanged = desiredModelId !== undefined && currentModelId !== desiredModelId; if (providerChanged || modelChanged) { - const newConfig = getModelConfigForProvider(config.provider, config.modelId); + const newConfig = getModelConfigForProvider(config.provider, desiredModelId); await DataUpdate.execute({ collection: 'users', dbHandle: 'default', @@ -381,14 +397,31 @@ export async function seedDatabase(): Promise { const localModel = selectLocalModel(0); const created: Map = new Map(); + // Resolve symbolic modelRef → concrete modelId via ModelRegistry. Each + // persona's stored modelId stays synced with src/shared/models.json so + // changing the registry value updates seeded personas on next startup + // (Joel 2026-05-04: "personas PICK UP THE MODEL change and arent stuck + // in the past"). + const { resolveModel, tierFromRamGB } = await import('../shared/ModelRegistry'); + const seedRamGB = Math.round(require('os').totalmem() / 1024 / 1024 / 1024); + const seedTier = tierFromRamGB(seedRamGB); + for (const config of personas) { try { + let resolvedModelId = config.modelId; + if (!resolvedModelId && config.modelRef) { + try { + resolvedModelId = resolveModel(config.modelRef, seedTier).hf_repo; + } catch (e) { + console.warn(` ⚠️ ${config.displayName}: modelRef '${config.modelRef}' did not resolve: ${e}`); + } + } const user = await seeder.findOrCreateUser( config.uniqueId, config.displayName, config.type === 'agent' ? 
'agent' : 'persona', config.provider, - config.modelId, + resolvedModelId, ); created.set(config.uniqueId, user); } catch (err) { diff --git a/src/shared/ModelRegistry.ts b/src/shared/ModelRegistry.ts new file mode 100644 index 000000000..128b4175d --- /dev/null +++ b/src/shared/ModelRegistry.ts @@ -0,0 +1,197 @@ +/** + * ModelRegistry — single source of truth reader for src/shared/models.json. + * + * ALL model lookups go through here. Consumers: + * - src/scripts/seed/personas.ts (resolves persona.modelRef → current modelId) + * - src/daemons/ai-provider-daemon/adapters/candle/CandleAdapter.ts + * (accepts symbolic refs, resolves to concrete model) + * - src/scripts/download-models.sh (reads via jq for tier/auto_download set) + * - install.sh (reads via jq for PERSONA_MODEL tier resolution) + * + * Architectural rule: NEVER hardcode a model ID in code or DB rows. Always + * use a symbolic ref ('local-default', 'vision-default', 'gating') OR a + * registry key ('qwen3.5-4b-code-forged'). Registry edits propagate + * everywhere on next read; seeded data does not need migration. + */ + +import * as fs from 'fs'; +import * as path from 'path'; + +export type ModelKind = 'chat-llm' | 'vision-llm' | 'embedding' | 'stt' | 'tts' | 'tts-trainable' | 'vad' | 'chat-llm-fast'; +export type Tier = 'mba' | 'mid' | 'full'; + +/** + * Canonical symbolic refs that personas store in DB. Code reads these + * constants — never hardcode the underlying strings. Joel rule + * 2026-05-04: "define constants not magic strings". + * + * Adding a new symbolic ref: add the constant here, add the entry to + * src/shared/models.json `symbolic_refs{}`, document below. + */ +export const SYMBOLIC_REFS = { + /** Local chat model — tier-resolved. Resolves to tiers[host_tier].default_chat. */ + LOCAL_DEFAULT: 'local-default', + /** Native-vision model. Currently bound to qwen2-vl-7b. */ + VISION_DEFAULT: 'vision-default', + /** Fast classification/gating model. 
*/ + GATING: 'gating', +} as const; +export type SymbolicRef = typeof SYMBOLIC_REFS[keyof typeof SYMBOLIC_REFS]; + +/** Tier constants — code uses these instead of bare 'mba' / 'mid' / 'full' strings. */ +export const TIERS = { + MBA: 'mba' as const, + MID: 'mid' as const, + FULL: 'full' as const, +}; + +export interface ModelSpec { + kind: ModelKind; + hf_repo: string; + format: string; + architecture?: string; + files?: string[]; + size_gb: number; + min_ram_gb?: number; + chat_template?: string; + description: string; + auto_load?: boolean; +} + +export interface TierSpec { + min_ram_gb: number; + default_chat: string; // registry key + description: string; +} + +interface RegistryFile { + models: Record; + tiers: Record; + symbolic_refs: Record; + personas: Record; + auto_download: { + always: string[]; + by_tier: Record; + }; + chat_templates: Record>; +} + +let _cached: RegistryFile | null = null; + +function load(): RegistryFile { + if (_cached) return _cached; + // Resolve registry across three runtime shapes: + // 1. Compiled: __dirname=dist/shared, JSON copied alongside by build script. + // 2. tsx dev: __dirname=src/shared, JSON sits next to ModelRegistry.ts. + // 3. dist-without-copy: __dirname=dist/shared, source JSON at ../../src/shared/. + // Try each in order so the first one that exists wins. Surface a clear + // error if none — no silent fallback to default model. + const candidates = [ + path.join(__dirname, 'models.json'), + path.join(__dirname, '..', '..', 'src', 'shared', 'models.json'), + path.join(__dirname, '..', '..', '..', 'src', 'shared', 'models.json'), + ]; + let found: string | undefined; + for (const p of candidates) { + if (fs.existsSync(p)) { found = p; break; } + } + if (!found) { + throw new Error( + `ModelRegistry: models.json not found. Tried: ${candidates.join(', ')}. 
` + + `Build script must copy shared/models.json → dist/shared/models.json.` + ); + } + const raw = fs.readFileSync(found, 'utf8'); + _cached = JSON.parse(raw) as RegistryFile; + return _cached; +} + +/** + * Pick host tier from total RAM in GB. Same logic as install.sh's + * tier-detection block — kept consistent so install-time and runtime + * resolve to the same default model. + */ +export function tierFromRamGB(ramGB: number): Tier { + if (ramGB >= 32) return 'full'; + if (ramGB >= 24) return 'mid'; + return 'mba'; +} + +/** + * Resolve a symbolic ref ('local-default', 'vision-default', 'gating') OR + * a direct registry key to a concrete ModelSpec. Always reads current + * registry — DB rows storing symbolic refs auto-pick-up registry edits. + */ +export function resolveModel(ref: string, tier?: Tier): ModelSpec { + const reg = load(); + const sym = reg.symbolic_refs[ref]; + if (sym) { + if (sym.by_tier) { + if (!tier) { + throw new Error(`Symbolic ref '${ref}' is tier-dependent but no tier provided.`); + } + const modelKey = reg.tiers[tier].default_chat; + const spec = reg.models[modelKey]; + if (!spec) throw new Error(`Tier '${tier}' default_chat '${modelKey}' not found in models.`); + return spec; + } + if (sym.model) { + const spec = reg.models[sym.model]; + if (!spec) throw new Error(`Symbolic ref '${ref}' → '${sym.model}' not found in models.`); + return spec; + } + } + const direct = reg.models[ref]; + if (direct) return direct; + throw new Error(`Model ref '${ref}' not found (not a symbolic ref nor a registry key).`); +} + +/** + * Resolve a persona's symbolic ref to a concrete model spec. + * `personas.ts` stores symbolic refs in modelRef field; this function + * is what the AI provider chain calls at request time. 
+ */ +export function resolvePersonaModel(personaDisplayName: string, tier: Tier): ModelSpec { + const reg = load(); + const ref = reg.personas[personaDisplayName]; + if (!ref) throw new Error(`No registry entry for persona '${personaDisplayName}'.`); + return resolveModel(ref, tier); +} + +/** + * Set of model registry keys that should be downloaded by model-init for + * a given tier. Used by download-models.sh and integration tests. + */ +export function downloadSetForTier(tier: Tier): string[] { + const reg = load(); + return [...reg.auto_download.always, ...(reg.auto_download.by_tier[tier] || [])]; +} + +/** + * Get all registered persona-displayName → symbolic-ref pairs. Reconciler + * uses this on startup to ensure DB persona rows match current registry. + */ +export function allPersonaRefs(): Record { + return { ...load().personas }; +} + +/** + * Get the symbolic ref a persona should store in DB. + * Use this in seed-in-process.ts when creating/updating persona rows. + */ +export function symbolicRefForPersona(personaDisplayName: string): string | undefined { + return load().personas[personaDisplayName]; +} + +export function getModelSpec(key: string): ModelSpec | undefined { + return load().models[key]; +} + +export function getChatTemplate(name: string): Record | undefined { + return load().chat_templates[name]; +} + +/** Force re-read on next call (test helper). */ +export function _resetCacheForTests(): void { + _cached = null; +} diff --git a/src/shared/generated/inference/ModelRegistry.ts b/src/shared/generated/inference/ModelRegistry.ts index 322c928b2..077d3548e 100644 --- a/src/shared/generated/inference/ModelRegistry.ts +++ b/src/shared/generated/inference/ModelRegistry.ts @@ -2,6 +2,8 @@ import type { ModelRegistryEntry } from "./ModelRegistryEntry"; /** - * Full model registry — maps aliases to model entries. + * Full model registry — mirrors `src/shared/models.json` SSOT shape. 
+ * Extra fields (`personas`, `auto_download`, `chat_templates`) are + * silently ignored by serde for the in-Rust subset we consume here. */ export type ModelRegistry = { models: { [key in string]: ModelRegistryEntry }, }; diff --git a/src/shared/generated/inference/ModelRegistryEntry.ts b/src/shared/generated/inference/ModelRegistryEntry.ts index 297f7b1d1..a7646e83b 100644 --- a/src/shared/generated/inference/ModelRegistryEntry.ts +++ b/src/shared/generated/inference/ModelRegistryEntry.ts @@ -3,14 +3,27 @@ /** * Single source of truth for local model metadata. * - * Model registry entry loaded from model_registry.json (embedded at compile time). - * TypeScript gets these types via ts-rs — NO hand-written duplicates. + * Model registry entry deserialized from src/shared/models.json (embedded at + * compile time). TypeScript gets these types via ts-rs — NO hand-written + * duplicates. + * + * **Schema mirrors `src/shared/ModelRegistry.ts`'s `ModelSpec`** so both + * runtimes read the same JSON. Field names use the new SSOT shape + * (`hf_repo`, `min_ram_gb`); legacy aliases (`repo`, `min_memory_gb`) + * kept via `serde(alias = ...)` so any third-party consumer of the old + * embedded JSON keeps working until it migrates. */ export type ModelRegistryEntry = { /** - * HuggingFace repo ID (canonical source) + * HuggingFace repo ID (canonical source). + * New SSOT field name; `repo` accepted as legacy alias. + */ +hf_repo: string, +/** + * Model kind: "chat-llm", "vision-llm", "embedding", "stt", "tts", "vad". + * Optional for back-compat with the legacy schema. */ -repo: string, +kind?: string, /** * Serialization format: "gguf" or "safetensors" */ @@ -19,15 +32,28 @@ format?: string, * Model architecture: "qwen2", "llama", "phi", etc. */ architecture?: string, +/** + * Files belonging to this model (relative to repo root). + */ +files?: Array, +/** + * Approximate disk footprint in GB. + */ +size_gb?: number, +/** + * Minimum host RAM in GB to run this model. 
+ * New SSOT field name; `min_memory_gb` accepted as legacy alias. + */ +min_ram_gb?: number, /** * Human-readable description */ description?: string, /** - * Minimum GPU memory in GB to run this model + * Chat template name: "qwen2", "llama3", "chatml" */ -min_memory_gb?: number, +chat_template?: string, /** - * Chat template name: "qwen2", "llama3", "chatml" + * Whether this model is auto-loaded at startup (informational). */ -chat_template?: string, }; +auto_load?: boolean, }; diff --git a/src/shared/models.json b/src/shared/models.json new file mode 100644 index 000000000..5bcd6aa21 --- /dev/null +++ b/src/shared/models.json @@ -0,0 +1,186 @@ +{ + "_doc": "Single source of truth for all models the system uses. ALL consumers (install.sh, model-init download scripts, continuum-core Rust loader, persona seed) read from this file. To swap a model: edit ONE entry here. Personas store symbolic refs (e.g. 'local-default', 'vision-default') so changing the registry value automatically picks up everywhere on next inference call — seeded data does NOT need migration.", + "_consumers": [ + "src/shared/ModelRegistry.ts (TS reader)", + "src/workers/continuum-core/src/inference/registry.rs (Rust reader)", + "install.sh (resolves PERSONA_MODEL via tier)", + "src/scripts/download-models.sh (model-init container — downloads all auto_download:true models)", + "src/scripts/seed/personas.ts (resolves symbolic refs to current model on lookup)" + ], + + "models": { + "qwen3.5-0.8b-general": { + "kind": "chat-llm", + "hf_repo": "continuum-ai/qwen3.5-0.8b-general-forged", + "format": "gguf", + "architecture": "qwen3", + "files": ["qwen3.5-0.8b-general-forged-q4_k_m.gguf"], + "size_gb": 0.5, + "min_ram_gb": 16, + "chat_template": "qwen2", + "description": "0.8B general — MBA tier (16-23GB RAM). Chat-functional with headroom." 
+ }, + "qwen3.5-2b-general": { + "kind": "chat-llm", + "hf_repo": "continuum-ai/qwen3.5-2b-general-forged", + "format": "gguf", + "architecture": "qwen3", + "files": ["qwen3.5-2b-general-forged-q4_k_m.gguf"], + "size_gb": 1.4, + "min_ram_gb": 24, + "chat_template": "qwen2", + "description": "2B general — mid tier (24-31GB RAM). Bigger context window." + }, + "qwen3.5-4b-code-forged": { + "kind": "chat-llm", + "hf_repo": "continuum-ai/qwen3.5-4b-code-forged-GGUF", + "format": "gguf", + "architecture": "qwen3", + "files": ["qwen3.5-4b-code-forged-q4_k_m.gguf"], + "size_gb": 2.7, + "min_ram_gb": 32, + "chat_template": "qwen2", + "description": "4B code-forged — full tier (32GB+ RAM). 70%+ HumanEval. Default chat for full-tier devices." + }, + "qwen2-vl-7b": { + "kind": "vision-llm", + "hf_repo": "Qwen/Qwen2-VL-7B-Instruct-GGUF", + "format": "gguf", + "architecture": "qwen2-vl", + "files": ["qwen2-vl-7b-instruct-q4_k_m.gguf", "mmproj-Qwen2-VL-7B-Instruct-f16.gguf"], + "size_gb": 5.0, + "min_ram_gb": 16, + "chat_template": "qwen2", + "description": "Native-vision Qwen2-VL 7B. Persona: Vision AI. mmproj sidecar required for vision encoder." + }, + "AllMiniLML6V2": { + "kind": "embedding", + "hf_repo": "sentence-transformers/all-MiniLM-L6-v2", + "format": "candle-builtin", + "size_gb": 0.09, + "auto_load": true, + "description": "384-dim sentence embedding. Pre-loaded by continuum-core at boot for RAG + semantic search." + }, + "whisper-base-en": { + "kind": "stt", + "hf_repo": "ggerganov/whisper.cpp", + "format": "ggml", + "files": ["ggml-base.en.bin"], + "size_gb": 0.075, + "description": "Whisper base.en — fast STT, ~60-70% accuracy. Voice transcription." 
+ }, + "piper-libritts-r-medium": { + "kind": "tts", + "hf_repo": "rhasspy/piper-voices", + "format": "onnx", + "files": ["en/en_US/libritts_r/medium/en_US-libritts_r-medium.onnx", "en/en_US/libritts_r/medium/en_US-libritts_r-medium.onnx.json"], + "size_gb": 0.063, + "description": "Piper TTS — high-quality voice synthesis." + }, + "kokoro-82m": { + "kind": "tts", + "hf_repo": "onnx-community/Kokoro-82M-v1.0-ONNX", + "format": "onnx", + "files": ["onnx/model_q8f16.onnx", "voices.bin"], + "size_gb": 0.08, + "description": "Kokoro 82M ONNX TTS — high quality, lightweight." + }, + "silero-vad": { + "kind": "vad", + "hf_repo": "onnx-community/silero-vad", + "format": "onnx", + "files": ["onnx/model.onnx"], + "size_gb": 0.002, + "description": "Silero VAD — voice activity detection for live audio." + }, + "orpheus-3b-tts": { + "kind": "tts-trainable", + "hf_repo": "isaiahbjork/orpheus-3b-0.1-ft-Q4_K_M-GGUF", + "format": "gguf", + "files": ["orpheus-3b-0.1-ft-q4_k_m.gguf"], + "size_gb": 2.4, + "description": "Orpheus 3B TTS GGUF — LoRA-trainable voice cloning." + }, + "qwen2-0.5b-gating": { + "kind": "chat-llm-fast", + "hf_repo": "Qwen/Qwen2-0.5B-Instruct", + "format": "safetensors", + "architecture": "qwen2", + "size_gb": 0.5, + "chat_template": "qwen2", + "description": "Tiny gating/classification model. Fast, low-latency decisions before full inference." + }, + "coder": { + "kind": "chat-llm", + "hf_repo": "continuum-ai/qwen2.5-coder-14b-compacted", + "format": "gguf", + "architecture": "qwen2", + "size_gb": 9.0, + "min_ram_gb": 12, + "chat_template": "qwen2", + "description": "Coding agent — Qwen2.5-Coder-14B compacted (Q5_K_S, 9GB). Used by LocalModelRouter via LOCAL_MODELS.CODING_AGENT." 
+ }, + "coder-bf16": { + "kind": "chat-llm", + "hf_repo": "continuum-ai/qwen2.5-coder-14b-compacted", + "format": "safetensors", + "architecture": "qwen2", + "size_gb": 28.0, + "min_ram_gb": 32, + "chat_template": "qwen2", + "description": "Coding agent BF16 batch-prefill variant — explicitly selects safetensors backend (32GB+)." + } + }, + + "tiers": { + "mba": { "min_ram_gb": 16, "default_chat": "qwen3.5-0.8b-general", "description": "MacBook Air / 16-23GB RAM. Chat-only OOTB, minimal footprint." }, + "mid": { "min_ram_gb": 24, "default_chat": "qwen3.5-2b-general", "description": "Mid-tier 24-31GB. Larger context window viable." }, + "full": { "min_ram_gb": 32, "default_chat": "qwen3.5-4b-code-forged", "description": "32GB+. Full multimodal experience including vision." } + }, + + "symbolic_refs": { + "local-default": { "_doc": "Personas with provider:local for chat. Resolved per-tier at request time.", "by_tier": true }, + "vision-default": { "_doc": "Personas needing native-vision. Independent of tier.", "model": "qwen2-vl-7b" }, + "gating": { "_doc": "Fast classification model.", "model": "qwen2-0.5b-gating" } + }, + + "personas": { + "_doc": "Persona displayName → symbolic ref. seed-in-process.ts uses these. Reconciler updates DB rows on startup if a persona's modelRef is missing or changed.", + "Helper AI": "local-default", + "Teacher AI": "local-default", + "CodeReview AI": "local-default", + "Local Assistant": "local-default", + "Vision AI": "vision-default" + }, + + "auto_download": { + "_doc": "Models that model-init container should pre-pull at first compose-up. 
Runs on every host (Mac/Linux/Windows) — replaces the Mac-only `docker model pull` flow which had no Linux equivalent.", + "always": ["AllMiniLML6V2", "whisper-base-en", "piper-libritts-r-medium", "kokoro-82m", "silero-vad"], + "by_tier": { + "mba": ["qwen3.5-0.8b-general"], + "mid": ["qwen3.5-2b-general"], + "full": ["qwen3.5-4b-code-forged", "qwen2-vl-7b"] + } + }, + + "chat_templates": { + "qwen2": { + "system": "<|im_start|>system\n{system}<|im_end|>\n", + "user": "<|im_start|>user\n{content}<|im_end|>\n", + "assistant": "<|im_start|>assistant\n", + "eos": "<|im_end|>" + }, + "llama3": { + "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>", + "user": "<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>", + "assistant": "<|start_header_id|>assistant<|end_header_id|>\n\n", + "eos": "<|eot_id|>" + }, + "chatml": { + "system": "<|im_start|>system\n{system}<|im_end|>\n", + "user": "<|im_start|>user\n{content}<|im_end|>\n", + "assistant": "<|im_start|>assistant\n", + "eos": "<|im_end|>" + } + } +} diff --git a/src/system/orchestration/SystemOrchestrator.ts b/src/system/orchestration/SystemOrchestrator.ts index 99158cff4..7bc8077a9 100644 --- a/src/system/orchestration/SystemOrchestrator.ts +++ b/src/system/orchestration/SystemOrchestrator.ts @@ -1116,17 +1116,21 @@ export class SystemOrchestrator extends EventEmitter { // after install completed and intermittently hit "Room not found: general" // because rooms hadn't landed yet. Awaiting seed here closes that race — // by the time downstream sees SERVER_READY, rooms+personas exist. + // + // Throws (not warns) on failure: chat/send, room routing, persona + // allocation, and Carl's first-page experience all require seeded + // rooms/users to exist. 
A warn-and-continue path just masks the + // real failure — observed in run 25403866714 where the smoke saw + // 'general room not present after 60s' as a soft warning while the + // actual seed had silently broken upstream. Loud failure surfaces + // the bug per Joel's no-suppression rule. try { const { seedDatabase } = await import('../../server/seed-in-process'); const seeded = await seedDatabase(); - if (seeded) { - console.log('✅ Database seeded (in-process)'); - } else { - console.log('✅ Database already seeded'); - } + console.log(seeded ? '✅ Database seeded (in-process)' : '✅ Database already seeded'); } catch (e: unknown) { const msg = e instanceof Error ? e.message : String(e); - console.warn(`⚠️ Auto-seed failed: ${msg}`); + throw new Error(`Auto-seed failed before server readiness: ${msg}`); } await milestoneEmitter.completeMilestone( @@ -1461,4 +1465,4 @@ export class SystemOrchestrator extends EventEmitter { /** * Global orchestrator instance */ -export const systemOrchestrator = new SystemOrchestrator(); \ No newline at end of file +export const systemOrchestrator = new SystemOrchestrator(); diff --git a/src/workers/continuum-core/src/inference/candle_adapter.rs b/src/workers/continuum-core/src/inference/candle_adapter.rs index 19d188d62..f95f9ec04 100644 --- a/src/workers/continuum-core/src/inference/candle_adapter.rs +++ b/src/workers/continuum-core/src/inference/candle_adapter.rs @@ -951,34 +951,84 @@ impl AIProviderAdapter for CandleAdapter { /// Single source of truth for local model metadata. /// -/// Model registry entry loaded from model_registry.json (embedded at compile time). -/// TypeScript gets these types via ts-rs — NO hand-written duplicates. +/// Model registry entry deserialized from src/shared/models.json (embedded at +/// compile time). TypeScript gets these types via ts-rs — NO hand-written +/// duplicates. +/// +/// **Schema mirrors `src/shared/ModelRegistry.ts`'s `ModelSpec`** so both +/// runtimes read the same JSON. 
Field names use the new SSOT shape +/// (`hf_repo`, `min_ram_gb`); legacy aliases (`repo`, `min_memory_gb`) +/// kept via `serde(alias = ...)` so any third-party consumer of the old +/// embedded JSON keeps working until it migrates. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, ts_rs::TS)] #[ts( export, export_to = "../../../shared/generated/inference/ModelRegistryEntry.ts" )] pub struct ModelRegistryEntry { - /// HuggingFace repo ID (canonical source) - pub repo: String, + /// HuggingFace repo ID (canonical source). + /// New SSOT field name; `repo` accepted as legacy alias. + #[serde(alias = "repo")] + pub hf_repo: String, + /// Model kind: "chat-llm", "vision-llm", "embedding", "stt", "tts", "vad". + /// Optional for back-compat with the legacy schema. + #[ts(optional)] + #[serde(default)] + pub kind: Option, /// Serialization format: "gguf" or "safetensors" #[ts(optional)] + #[serde(default)] pub format: Option, /// Model architecture: "qwen2", "llama", "phi", etc. #[ts(optional)] + #[serde(default)] pub architecture: Option, + /// Files belonging to this model (relative to repo root). + #[ts(optional, type = "Array")] + #[serde(default)] + pub files: Option>, + /// Approximate disk footprint in GB. + #[ts(optional, type = "number")] + #[serde(default)] + pub size_gb: Option, + /// Minimum host RAM in GB to run this model. + /// New SSOT field name; `min_memory_gb` accepted as legacy alias. + #[ts(optional, type = "number")] + #[serde(default, alias = "min_memory_gb")] + pub min_ram_gb: Option, /// Human-readable description #[ts(optional)] + #[serde(default)] pub description: Option, - /// Minimum GPU memory in GB to run this model - #[ts(optional, type = "number")] - pub min_memory_gb: Option, /// Chat template name: "qwen2", "llama3", "chatml" #[ts(optional)] + #[serde(default)] pub chat_template: Option, + /// Whether this model is auto-loaded at startup (informational). 
+ #[ts(optional)] + #[serde(default)] + pub auto_load: Option, } -/// Full model registry — maps aliases to model entries. +/// Tier specification used by symbolic-ref resolution. +#[derive(Debug, Clone, serde::Deserialize, Default)] +#[serde(default)] +struct TierSpec { + pub default_chat: String, +} + +/// Symbolic ref: either tier-bound (resolves via `tiers[host_tier].default_chat`) +/// or model-bound (resolves to the named registry key directly). +#[derive(Debug, Clone, serde::Deserialize, Default)] +#[serde(default)] +struct SymbolicRefSpec { + pub by_tier: bool, + pub model: Option, +} + +/// Full model registry — mirrors `src/shared/models.json` SSOT shape. +/// Extra fields (`personas`, `auto_download`, `chat_templates`) are +/// silently ignored by serde for the in-Rust subset we consume here. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, ts_rs::TS)] #[ts( export, @@ -988,40 +1038,134 @@ pub struct ModelRegistry { pub models: HashMap, } -/// Load the model registry from the embedded JSON. -pub fn load_registry() -> ModelRegistry { - let json = include_str!("model_registry.json"); - serde_json::from_str(json).unwrap_or_else(|e| { - runtime::logger("candle").error(&format!("Failed to parse model registry: {e}")); - ModelRegistry { +/// Internal full-shape view used for symbolic-ref + tier resolution. +/// Not exported to TS (TS has its own ModelRegistry.ts reader for this). +#[derive(Debug, Clone, serde::Deserialize)] +struct FullRegistry { + pub models: HashMap, + #[serde(default)] + pub tiers: HashMap, + #[serde(default)] + pub symbolic_refs: HashMap, +} + +/// Embedded SSOT registry. Path is relative to *this file*: +/// workers/continuum-core/src/inference/candle_adapter.rs +/// → ../../../../shared/models.json (= src/shared/models.json) +/// Joel rule 2026-05-04: "we MUST have this work from ONE source of truth". 
+const REGISTRY_JSON: &str = include_str!("../../../../shared/models.json"); + +fn load_full_registry() -> FullRegistry { + serde_json::from_str(REGISTRY_JSON).unwrap_or_else(|e| { + runtime::logger("candle").error(&format!( + "Failed to parse src/shared/models.json: {e}" + )); + FullRegistry { models: HashMap::new(), + tiers: HashMap::new(), + symbolic_refs: HashMap::new(), } }) } +/// Load the model registry from the embedded JSON (legacy public API — +/// returns the lower-fidelity `ModelRegistry` view for back-compat). +pub fn load_registry() -> ModelRegistry { + ModelRegistry { + models: load_full_registry().models, + } +} + +/// Pick host tier from total RAM. Mirrors the TS `tierFromRamGB` logic +/// in `src/shared/ModelRegistry.ts` so install-time and runtime resolve +/// to the same default model. +fn tier_from_host_ram() -> &'static str { + let bytes = sysinfo_total_memory_bytes(); + let gb = (bytes / 1024 / 1024 / 1024) as u32; + if gb >= 32 { + "full" + } else if gb >= 24 { + "mid" + } else { + "mba" + } +} + +/// Total host memory in bytes. Cheap to call repeatedly; caller decides cache. +fn sysinfo_total_memory_bytes() -> u64 { + // Minimal probe — avoids pulling in a sysinfo dep just for this. + // Linux: /proc/meminfo. macOS: sysctl hw.memsize. Fallback: 16GB so + // we land on the "mba" tier (smallest model) rather than crashing. 
+ #[cfg(target_os = "linux")] + { + if let Ok(s) = std::fs::read_to_string("/proc/meminfo") { + for line in s.lines() { + if let Some(rest) = line.strip_prefix("MemTotal:") { + if let Some(kb_str) = rest.trim().split_whitespace().next() { + if let Ok(kb) = kb_str.parse::() { + return kb * 1024; + } + } + } + } + } + } + #[cfg(target_os = "macos")] + { + use std::process::Command; + if let Ok(out) = Command::new("sysctl").args(["-n", "hw.memsize"]).output() { + if let Ok(s) = String::from_utf8(out.stdout) { + if let Ok(b) = s.trim().parse::() { + return b; + } + } + } + } + 16 * 1024 * 1024 * 1024 +} + pub fn resolve_model_id(requested: &str) -> String { - // Already a HuggingFace repo ID + // Already a HuggingFace repo ID — pass through. if requested.contains('/') { return requested.to_string(); } let normalized = requested.trim().to_lowercase(); - let registry = load_registry(); + let reg = load_full_registry(); + + // 1. Symbolic ref ('local-default', 'vision-default', 'gating') — resolve + // via tiers + symbolic_refs. Reads current registry on every call so + // DB rows storing symbolic refs auto-pick-up registry edits. + if let Some(sym) = reg.symbolic_refs.get(&normalized) { + if sym.by_tier { + let tier = tier_from_host_ram(); + if let Some(t) = reg.tiers.get(tier) { + if let Some(entry) = reg.models.get(&t.default_chat) { + return entry.hf_repo.clone(); + } + } + } else if let Some(model_key) = sym.model.as_deref() { + if let Some(entry) = reg.models.get(model_key) { + return entry.hf_repo.clone(); + } + } + } - // Look up in registry (supports "coder", "smollm2:1.7b", "llama3.2:3b", etc.) - if let Some(entry) = registry.models.get(&normalized) { - return entry.repo.clone(); + // 2. Direct registry key lookup ('coder', 'qwen2-vl-7b', 'qwen3.5-4b-code-forged'). + if let Some(entry) = reg.models.get(&normalized) { + return entry.hf_repo.clone(); } - // Try with common alias patterns: "smollm2-1.7b" → "smollm2:1.7b" + // 3. 
Common alias pattern: 'smollm2-1.7b' → 'smollm2:1.7b'. let dash_to_colon = normalized.replacen('-', ":", 1); - if let Some(entry) = registry.models.get(&dash_to_colon) { - return entry.repo.clone(); + if let Some(entry) = reg.models.get(&dash_to_colon) { + return entry.hf_repo.clone(); } - // Fallback: treat as HF repo ID + // 4. Fallback: treat as HF repo ID. Loud so unknown models stay diagnosable. runtime::logger("candle").warn(&format!( - "Model '{}' not in registry — treating as HuggingFace repo ID", + "Model '{}' not in registry (no symbolic ref, no key match) — \ + treating as HuggingFace repo ID", requested )); requested.to_string() @@ -1502,11 +1646,43 @@ mod tests { #[test] fn test_resolve_chat_template() { + // Live registry keys (post-SSOT migration to src/shared/models.json). assert_eq!(resolve_chat_template("coder"), "qwen2"); - assert_eq!(resolve_chat_template("coder-14b"), "qwen2"); - assert_eq!(resolve_chat_template("coder-32b"), "qwen2"); - assert_eq!(resolve_chat_template("llama3.2:3b"), "llama3"); - assert_eq!(resolve_chat_template("smollm2"), "chatml"); + assert_eq!(resolve_chat_template("coder-bf16"), "qwen2"); + assert_eq!(resolve_chat_template("qwen3.5-4b-code-forged"), "qwen2"); + assert_eq!(resolve_chat_template("qwen2-vl-7b"), "qwen2"); + // Heuristic fallback: name-based inference for unknown models. + assert_eq!(resolve_chat_template("some-qwen-thing"), "qwen2"); + assert_eq!(resolve_chat_template("smollm2-future"), "chatml"); assert_eq!(resolve_chat_template("unknown-model"), "llama3"); // default fallback } + + #[test] + fn test_resolve_model_id_symbolic_refs() { + // Symbolic refs resolve via src/shared/models.json. Tier resolves + // from host RAM at runtime — we only assert that resolution + // succeeds (non-passthrough) for tier-bound refs and that + // model-bound refs always resolve to the same concrete model. 
+ let local = resolve_model_id("local-default"); + assert_ne!(local, "local-default", "local-default must resolve to a concrete repo"); + assert!(local.contains('/'), "resolved model must look like an HF repo: got {local}"); + + let vision = resolve_model_id("vision-default"); + assert_eq!(vision, "Qwen/Qwen2-VL-7B-Instruct-GGUF"); + + let gating = resolve_model_id("gating"); + assert_eq!(gating, "Qwen/Qwen2-0.5B-Instruct"); + + // Direct registry-key lookup. + assert_eq!( + resolve_model_id("coder"), + "continuum-ai/qwen2.5-coder-14b-compacted" + ); + + // Pass-through for raw HF IDs. + assert_eq!( + resolve_model_id("Qwen/Qwen2-7B-Instruct"), + "Qwen/Qwen2-7B-Instruct" + ); + } } diff --git a/src/workers/continuum-core/src/inference/model_registry.json b/src/workers/continuum-core/src/inference/model_registry.json deleted file mode 100644 index c3f77c944..000000000 --- a/src/workers/continuum-core/src/inference/model_registry.json +++ /dev/null @@ -1,97 +0,0 @@ -{ - "_comment": "Model registry: aliases → HuggingFace repos. Continuum auto-downloads on first use.", - "models": { - "coder": { - "repo": "continuum-ai/qwen2.5-coder-14b-compacted", - "format": "gguf", - "architecture": "qwen2", - "description": "14B coding model, compacted (25Q/5KV), Q5_K_S. Fits 16GB MacBook Air.", - "min_memory_gb": 12, - "chat_template": "qwen2" - }, - "coder-14b": { - "repo": "continuum-ai/qwen2.5-coder-14b-compacted", - "format": "gguf", - "architecture": "qwen2", - "description": "14B coding model for 16GB+ devices", - "min_memory_gb": 12, - "chat_template": "qwen2" - }, - "coder-32b": { - "repo": "continuum-ai/qwen2.5-coder-32b-compacted", - "format": "gguf", - "architecture": "qwen2", - "description": "32B coding model for 32GB+ devices. 
Needs QAT for full quality.", - "min_memory_gb": 20, - "chat_template": "qwen2" - }, - "smollm2": { - "repo": "HuggingFaceTB/SmolLM2-135M-Instruct", - "format": "safetensors", - "architecture": "llama", - "description": "135M tiny model for testing", - "min_memory_gb": 1, - "chat_template": "chatml" - }, - "smollm2:1.7b": { - "repo": "HuggingFaceTB/SmolLM2-1.7B-Instruct", - "format": "safetensors", - "architecture": "llama", - "description": "1.7B small model", - "min_memory_gb": 4, - "chat_template": "chatml" - }, - "llama3.2:3b": { - "repo": "unsloth/Llama-3.2-3B-Instruct", - "format": "safetensors", - "architecture": "llama", - "description": "3B general model", - "min_memory_gb": 6, - "chat_template": "llama3" - }, - "qwen2.5-coder:32b": { - "repo": "Qwen/Qwen2.5-Coder-32B-Instruct", - "format": "safetensors", - "architecture": "qwen2", - "description": "Full 32B (uncompacted, needs 80GB+)", - "min_memory_gb": 70, - "chat_template": "qwen2" - }, - "continuum-ai/qwen3.5-4b-code-forged": { - "repo": "continuum-ai/qwen3.5-4b-code-forged-GGUF", - "format": "gguf", - "architecture": "qwen3", - "description": "4B code model, forged with experiential plasticity. 70%+ HumanEval. 2.6GB Q4_K_M.", - "min_memory_gb": 3, - "chat_template": "qwen2" - }, - "continuum-ai/qwen3.5-27b-code-forged": { - "repo": "continuum-ai/qwen3.5-27b-code-forged", - "format": "safetensors", - "architecture": "qwen3", - "description": "27B code model, forged with experiential plasticity. 
Needs 17GB+ VRAM.", - "min_memory_gb": 17, - "chat_template": "qwen2" - } - }, - "chat_templates": { - "qwen2": { - "system": "<|im_start|>system\n{system}<|im_end|>\n", - "user": "<|im_start|>user\n{content}<|im_end|>\n", - "assistant": "<|im_start|>assistant\n", - "eos": "<|im_end|>" - }, - "llama3": { - "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>", - "user": "<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>", - "assistant": "<|start_header_id|>assistant<|end_header_id|>\n\n", - "eos": "<|eot_id|>" - }, - "chatml": { - "system": "<|im_start|>system\n{system}<|im_end|>\n", - "user": "<|im_start|>user\n{content}<|im_end|>\n", - "assistant": "<|im_start|>assistant\n", - "eos": "<|im_end|>" - } - } -}