CambrianTech · joelteply · May 5, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/.github/workflows/carl-install-smoke.yml b/.github/workflows/carl-install-smoke.yml
@@ -45,6 +45,10 @@ on:
         description: 'Git ref to fetch install.sh from (sha / branch / tag)'
         required: false
         default: ''
+      image_tag:
+        description: 'Docker image tag to pull (default: canary). Useful values: canary, latest, pr-<N>, <sha-prefix>.'
+        required: false
+        default: 'canary'
 
 jobs:
   carl-install-smoke-amd64:
@@ -68,15 +72,46 @@ jobs:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
+      - name: Install mesa-vulkan-drivers (llvmpipe ICD for no-GPU CI runner)
+        # The default continuum-core-vulkan binary calls Vulkan via the loader.
+        # ubuntu-latest has no GPU hardware → no real ICD → the loader returns
+        # zero devices → the binary panics (Joel's "lack of GPU integration is
+        # forbidden" rule). mesa-vulkan-drivers installs the llvmpipe software
+        # ICD so the loader returns a (software) device and the GPU code path
+        # is exercised as it would be on a hardware-GPU host. vulkan-tools
+        # provides vulkaninfo for the slice probes (test-slices.sh). The
+        # summary printed below is diagnostic only; `|| true` keeps it non-fatal.
+        run: |
+          sudo apt-get update -y
+          sudo apt-get install -y mesa-vulkan-drivers vulkan-tools
+          echo "vulkaninfo summary:"
+          vulkaninfo --summary 2>&1 | head -20 || true
+
       - name: Login to ghcr.io (so install.sh can pull pre-built images)
         run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
 
       - name: Run carl-install smoke
         env:
-          # Pass the PR HEAD sha so the smoke fetches the install.sh from
-          # THIS PR (not main). Falls back to manual workflow_dispatch input
-          # when not in a PR context.
+          # PR HEAD sha so smoke fetches install.sh from THIS PR.
           CARL_INSTALL_REF: ${{ github.event.pull_request.head.sha || inputs.install_ref || github.sha }}
+          # Pin docker images to :pr-N (PR-scoped, mutable per push). Refreshed
+          # by push-image.sh on every dev push, so always reflects this PR's
+          # latest source — but never collides with another PR or canary.
+          # Slices the dev didn't push directly are aliased from :canary by the
+          # dev script (manifest copy, no rebuild). :latest was the prior
+          # default and went 9-14 days stale in April 2026 — never use it for
+          # smoke.
+          #
+          # Resolution priority: PR# > inputs.image_tag > 'canary'.
+          # Previously, on workflow_dispatch (no PR context), the bare
+          # `pr-${{ ... }}` interpolated to 'pr-' (empty after the dash), so
+          # install.sh missed the registry and fell back to 'will build
+          # locally' — a full Rust compile of continuum-core-vulkan on the
+          # no-GPU runner that hit the 25-min runner cap (observed run
+          # 25400718464). The conditional below makes manual triggers
+          # default to the canary tag (the cadence we publish on) and lets
+          # operators override via the image_tag input from the UI.
+          CONTINUUM_IMAGE_TAG: ${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || inputs.image_tag || 'canary' }}
           # 25-min cap on the docker-only install. Hybrid (Mac source-build)
           # path would exceed this — by design, that's the gate firing on
           # the README/install mismatch.
@@ -91,14 +126,44 @@ jobs:
           SKIP_TEARDOWN: '0'
         run: bash scripts/ci/carl-install-smoke.sh
 
-      - name: Upload install + page + chat artifacts on failure
+      - name: Capture docker logs from all containers on failure (continuum-core,
+          node-server, model-init, widget-server, livekit-bridge)
+        if: failure()
+        run: |
+          # Dump every container's logs for each carl-smoke compose project.
+          # install.log + page + chat are all captured OUTSIDE the containers
+          # and never show WHY continuum-core / node-server didn't reply (the
+          # silent inference failure that blocked PR #1038 on 2026-05-04).
+          # Keep any non-empty log the smoke script already saved: containers
+          # may be gone by now and a fresh `logs` would truncate it to nothing.
+          set +e
+          for dir in /tmp/carl-smoke-*; do
+            [ -d "$dir" ] || continue
+            [ -f "$dir/docker-compose.yml" ] || continue
+            for svc in continuum-core node-server model-init widget-server livekit-bridge; do
+              [ -s "${dir}.${svc}.log" ] || docker compose -f "$dir/docker-compose.yml" \
+                logs --no-color --timestamps "$svc" > "${dir}.${svc}.log" 2>&1
+              docker compose -f "$dir/docker-compose.yml" ps "$svc" \
+                > "${dir}.${svc}.ps" 2>&1
+            done
+            [ -s "${dir}.compose-ps.log" ] || docker compose -f "$dir/docker-compose.yml" ps -a > "${dir}.compose-ps.log" 2>&1
+          done
+      - name: Upload install + page + chat + docker logs + screenshot artifacts on failure
         if: failure()
         uses: actions/upload-artifact@v4
         with:
           name: carl-install-debug-${{ github.event.pull_request.head.sha || github.sha }}
           path: |
             /tmp/carl-smoke-*.install.log
             /tmp/carl-smoke-*.page.html
+            /tmp/carl-smoke-*.page.png
             /tmp/carl-smoke-*.chat.log
+            /tmp/carl-smoke-*.continuum-core.log
+            /tmp/carl-smoke-*.node-server.log
+            /tmp/carl-smoke-*.model-init.log
+            /tmp/carl-smoke-*.widget-server.log
+            /tmp/carl-smoke-*.livekit-bridge.log
+            /tmp/carl-smoke-*.compose-ps.log
+            /tmp/carl-smoke-*.*.ps
           retention-days: 7
           if-no-files-found: ignore
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -67,18 +67,31 @@ services:
       - WHISPER_MODEL=${WHISPER_MODEL:-base}
 
   # ── Continuum Core (Rust) ─────────────────────────────────
+  # Default uses the vulkan variant: software rendering via mesa's llvmpipe ICD
+  # when no GPU hardware is present, real driver ICD (NVIDIA/Intel/AMD) when one
+  # is. Joel's 2026-04-23 architectural rule: "lack of GPU integration is
+  # forbidden". The previous CPU-only 'core' variant violated that by panicking
+  # on no-GPU per gpu/memory_manager.rs:757. Vulkan-with-llvmpipe satisfies the
+  # rule (binary exercises the GPU API loader; llvmpipe answers the queries via
+  # software rasterizer). The CPU-only variant was removed in #1038 (Task #98)
+  # — see docs/INSTALL-ARCHITECTURE.md.
+  #
+  # CUDA hosts overlay docker-compose.gpu.yml to swap in continuum-core-cuda for
+  # NVIDIA-accelerated inference. Mac runs continuum-core natively (overlay
+  # docker-compose.mac.yml sets replicas:0 here).
   continuum-core:
     build:
       context: ./src/workers
-      dockerfile: ../../docker/continuum-core.Dockerfile
+      dockerfile: ../../docker/continuum-core-vulkan.Dockerfile
       additional_contexts:
         avatars: ./src/models/avatars
         shared-generated: ./src/shared/generated
       args:
         # --no-default-features excludes livekit-webrtc (handled by livekit-bridge).
         # load-dynamic-ort loads ONNX Runtime as shared lib (runtime discovery).
-        GPU_FEATURES: "--no-default-features --features load-dynamic-ort"
-    image: ghcr.io/cambriantech/continuum-core:${CONTINUUM_IMAGE_TAG:-latest}
+        # vulkan feature wires through to llama.cpp's GGML_VULKAN backend.
+        GPU_FEATURES: "--no-default-features --features load-dynamic-ort,vulkan"
+    image: ghcr.io/cambriantech/continuum-core-vulkan:${CONTINUUM_IMAGE_TAG:-latest}
     restart: unless-stopped
     # Sized for mission: Qwen 4-8B Q4 + KV cache for 5 personas + embeddings
     # + Bevy render + vision + audio. Auto-calculated by install.sh from host
@@ -199,7 +212,8 @@ services:
     restart: unless-stopped
     mem_limit: 512m
     depends_on:
-      - node-server
+      node-server:
+        condition: service_healthy
     ports:
       - "9003:9003"   # HTTP
     volumes:

diff --git a/docker/model-init.Dockerfile b/docker/model-init.Dockerfile
@@ -12,24 +12,30 @@ FROM node:20-slim
 LABEL org.opencontainers.image.source=https://github.com/CambrianTech/continuum
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl unzip bash ca-certificates \
+    curl unzip bash ca-certificates jq \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
-# Copy download scripts and their shared dependencies
-COPY scripts/download-voice-models.sh scripts/download-voice-models.sh
+# Single source of truth for ALL models the system uses (chat / vision /
+# embedding / STT / TTS / VAD). Per Joel 2026-05-04:
+# "we MUST have this work from ONE source of truth"
+COPY shared/models.json shared/models.json
+COPY scripts/download-models.sh scripts/download-models.sh
+# Avatar download (VRM files) — distinct from ML models, kept separate for now.
 COPY scripts/download-avatar-models.sh scripts/download-avatar-models.sh
 COPY scripts/generate-scene-models.ts scripts/generate-scene-models.ts
 COPY scripts/shared/ scripts/shared/
 COPY package.json package.json
 
-RUN chmod +x scripts/download-voice-models.sh scripts/download-avatar-models.sh
+RUN chmod +x scripts/download-models.sh scripts/download-avatar-models.sh
 
-# MODELS_DIR is set by docker-compose.yml to /models (the volume mount)
 ENV MODELS_DIR=/models
-
-# Download voice models (whisper, piper, kokoro, orpheus, vad)
-# then avatar models (VRM files)
-# Scene generation requires tsx — skip in init, handled by npm start
-CMD bash scripts/download-voice-models.sh && bash scripts/download-avatar-models.sh
+ENV REGISTRY=/app/shared/models.json
+
+# Download all models from src/shared/models.json (chat-LLM tier-default,
+# embeddings, STT, TTS, VAD) then avatar models. Per Joel 2026-05-04:
+# "all the models must download and run on GPU" — no DMR dependency.
+# continuum-core loads chat LLMs via its built-in llama.cpp + host GPU
+# (Metal / CUDA / Vulkan ICD).
+CMD bash scripts/download-models.sh && bash scripts/download-avatar-models.sh
diff --git a/docker/node-server.Dockerfile b/docker/node-server.Dockerfile
@@ -27,6 +27,6 @@ VOLUME ["/root/.continuum"]
 EXPOSE 9000 9001
 
 HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=3 \
-    CMD node -e "const s=require('net').connect(9001,'localhost',()=>{s.end();process.exit(0)});s.on('error',()=>process.exit(1))"
+    CMD test -f /root/.continuum/run/node-server.ready && node -e "const s=require('net').connect(9001,'localhost',()=>{s.end();process.exit(0)});s.on('error',()=>process.exit(1))"
 
 CMD ["npx", "tsx", "server/docker-entrypoint.ts"]
diff --git a/install.sh b/install.sh
@@ -425,12 +425,14 @@ EOF
   esac
   case "$IC_GPU_PATH" in
     dmr-*)
-      if ! docker model ls 2>/dev/null | grep -q "qwen3.5-4b-code-forged"; then
-        info "Pulling default persona model into Docker Model Runner (~2.7GB, first install only)..."
-        docker model pull "$PERSONA_MODEL" || warn "Model pull failed — chat will error until model is available. Retry: docker model pull $PERSONA_MODEL"
-      else
-        ok "Persona model already in DMR: $PERSONA_MODEL"
-      fi
+      # Per Joel 2026-05-04: "all the models must download and run on GPU"
+      # + "we MUST have this work from ONE source of truth". DMR's
+      # `docker model pull` was the Mac-only path that didn't work on
+      # Linux. Models now download via the model-init container reading
+      # src/shared/models.json — same path on Mac/Linux/Windows. The DMR
+      # branch here remains for KV-cache-config + vLLM-MLX install (which
+      # are still useful tuning), but no longer pulls the model.
+      ok "Persona model download deferred to model-init container (reads src/shared/models.json)"
       # Cap llama-server's per-slot KV cache reservation, sized to actual
       # physical RAM. Without this cap each slot reserves the full model
       # context (262144 tokens for Qwen3.5), ballooning
@@ -483,11 +485,10 @@ EOF
             # Pull MLX-format Qwen3.5-4B for vllm-metal routing.
             # DMR auto-routes MLX models to vllm-metal when installed.
             MLX_MODEL="hf.co/mlx-community/Qwen3.5-4B-MLX-4bit"
-            if ! docker model ls 2>/dev/null | grep -q "Qwen3.5-4B-MLX"; then
-              info "Pulling MLX-format Qwen3.5-4B (~2.5GB, for 3x faster inference)..."
-              docker model pull "$MLX_MODEL" \
-                || warn "MLX model pull failed. GGUF via llama.cpp will be used instead."
-            fi
+            # MLX-format model also moves to registry-driven download.
+            # Add MLX entry to src/shared/models.json + auto_download.always
+            # if/when we want vllm-metal to find it on disk.
+            ok "MLX model download deferred to model-init (add to src/shared/models.json to enable)"
           else
             warn "vLLM install failed (requires Docker Desktop 4.62+). llama.cpp Metal will be used."
           fi
@@ -887,10 +888,25 @@ elif [[ "$HAS_GPU" == "true" ]]; then
   if [ -f "docker-compose.gpu.yml" ]; then
     COMPOSE_FILES="$COMPOSE_FILES -f docker-compose.gpu.yml"
   else
-    warn "docker-compose.gpu.yml missing — GPU detected but cuda override won't apply. Continuing on CPU images."
+    warn "docker-compose.gpu.yml missing — GPU detected but cuda override won't apply. Continuing on Vulkan base image (still GPU-API; will use llvmpipe ICD if no vulkan driver)."
   fi
   COMPOSE_ARGS="--profile gpu"
 fi
+# Linux without a CUDA GPU: base docker-compose.yml uses continuum-core-vulkan.
+# On real-driver hosts (Intel/AMD with vulkan) this picks up the hardware ICD;
+# on hosts without a driver, mesa-vulkan-drivers (apt) provides llvmpipe as a
+# software ICD so the Vulkan code path runs without panicking. Joel's
+# 2026-04-23 rule: "lack of GPU integration is forbidden". Vulkan-via-
+# llvmpipe is GPU integration (loader + ICD), not a CPU fallback.
+if [[ "$OS" == "Linux" ]] && [[ "$HAS_GPU" != "true" ]]; then
+  if ! command -v vulkaninfo >/dev/null 2>&1; then
+    warn "vulkaninfo not found — install mesa-vulkan-drivers vulkan-tools so the Vulkan loader has the llvmpipe software ICD: sudo apt-get install -y mesa-vulkan-drivers vulkan-tools"
+  elif ! vulkaninfo --summary 2>/dev/null | grep -qE "deviceName"; then
+    warn "Vulkan loader present but enumerated zero devices. continuum-core-vulkan will panic on startup. Install: sudo apt-get install -y mesa-vulkan-drivers"
+  else
+    info "Vulkan loader OK — will use $(vulkaninfo --summary 2>/dev/null | grep -E 'deviceName' | head -1 | sed 's/.*= *//')"
+  fi
+fi
 
 # ── 7. Pull support-service images ─────────────────────────
 PHASE="pull images"
@@ -1044,6 +1060,38 @@ for i in $(seq 1 "$HEALTH_TIMEOUT_SEC"); do
   sleep 1
 done
 
+# ── 8c. Wait for node-server seed to populate the default room ──────
+# widget-server /health on port 9003 only proves that container is up.
+# node-server (port 9001) runs auto-seed in docker-entrypoint.ts which
+# creates the "general" room + personas. If the user opens the page or the
+# chat probe runs BEFORE seed completes, chat/send silently returns "Room
+# not found: general" or "User not found". Probe directly for the
+# general room via jtag — fast, no new endpoint needed, deterministic.
+# Caught by carl-install-smoke 2026-05-04 (PR #1038).
+SEED_TIMEOUT_SEC="${SEED_TIMEOUT_SEC:-60}"
+JTAG_BIN="$(command -v jtag 2>/dev/null || true)"
+[ -z "$JTAG_BIN" ] && JTAG_BIN="$INSTALL_DIR/src/jtag"
+if [ -x "$JTAG_BIN" ] && [ "$HEALTH_OK" -eq 1 ]; then
+  info "Waiting for seed to populate default room (timeout ${SEED_TIMEOUT_SEC}s)..."
+  SEED_OK=0
+  for i in $(seq 1 "$SEED_TIMEOUT_SEC"); do
+    # data/list returns success+items when the room exists. Empty items
+    # means seed hasn't created it yet.
+    if "$JTAG_BIN" data/list --collection=rooms --filter='{"uniqueId":"general"}' --limit=1 2>/dev/null \
+       | grep -q '"success":true.*"items":\[{'; then
+      SEED_OK=1
+      ok "default room seeded after ${i}s"
+      break
+    fi
+    sleep 1
+  done
+  if [ "$SEED_OK" -ne 1 ]; then
+    warn "general room not present after ${SEED_TIMEOUT_SEC}s — seed may have failed."
+    warn "  Chat will return 'Room not found' until seed completes."
+    warn "  Diagnose: $CONTAINER_CMD compose -f $INSTALL_DIR/docker-compose.yml logs node-server | tail -50"
+  fi
+fi
+
 # ── 9. Determine URL + open browser (only if healthy) ──────
 PHASE="open browser"
 if [ -n "$TS_HOSTNAME" ] && [ -f "$CONTINUUM_DATA/$TS_HOSTNAME.crt" ]; then

diff --git a/scripts/ci/carl-install-smoke.sh b/scripts/ci/carl-install-smoke.sh
@@ -48,6 +48,19 @@ echo "━━━━━━━━━━━━━━━━━━━━━━━━
 
 teardown() {
   local rc=$?
+  # Capture per-container docker logs BEFORE `docker compose down` kills
+  # the containers and makes their logs unrecoverable. Without this the
+  # workflow's `if: failure()` step fires after smoke exit when containers
+  # are already gone — exactly the silent-evidence-loss the per-container
+  # logs are supposed to prevent. Capture on every exit (success or
+  # failure); the workflow upload is failure-only, so success logs stay local.
+  if [ -d "$CARL_INSTALL_DIR" ] && [ -f "$CARL_INSTALL_DIR/docker-compose.yml" ]; then
+    for svc in continuum-core node-server model-init widget-server livekit-bridge; do
+      ( cd "$CARL_INSTALL_DIR" && docker compose logs --no-color --timestamps "$svc" \
+        > "${CARL_INSTALL_DIR}.${svc}.log" 2>&1 ) || true
+    done
+    ( cd "$CARL_INSTALL_DIR" && docker compose ps -a > "${CARL_INSTALL_DIR}.compose-ps.log" 2>&1 ) || true
+  fi
   if [ "$SKIP_TEARDOWN" != "1" ] && [ -d "$CARL_INSTALL_DIR" ]; then
     echo ""
     echo "━━━ tearing down $CARL_INSTALL_DIR ━━━"
@@ -167,6 +180,33 @@ done
 
 echo "✅ root page looks like real HTML (${ROOT_BYTES} bytes, no failure markers)"
 
+# ── 3b. Headless screenshot — what Carl ACTUALLY sees in the browser ──
+# curl gives the server-rendered HTML shell. The chat UI itself loads via
+# JS — could be a blank chat with no personas or an empty room and curl
+# wouldn't catch it. Use chromium headless to capture what a real browser
+# renders. Wait a few seconds for the JS to populate tabs, personas,
+# rooms before snapping. Continue on screenshot failure (chrome may not
+# be on the PATH for non-CI runs); this is diagnostic, not gating.
+PAGE_PNG="${CARL_INSTALL_DIR}.page.png"
+CHROME_BIN="$(command -v google-chrome || command -v chromium || command -v chromium-browser || true)"
+if [ -n "$CHROME_BIN" ]; then
+  echo ""
+  echo "━━━ headless screenshot via $CHROME_BIN (waits 8s for JS to render) ━━━"
+  sleep 8
+  "$CHROME_BIN" --headless --disable-gpu --no-sandbox --hide-scrollbars \
+    --window-size=1280,1024 \
+    --screenshot="$PAGE_PNG" \
+    --virtual-time-budget=8000 \
+    "http://localhost:9003/" >/dev/null 2>&1 || true
+  if [ -f "$PAGE_PNG" ]; then
+    echo "  ✓ screenshot saved: $PAGE_PNG ($(stat -c%s "$PAGE_PNG" 2>/dev/null || stat -f%z "$PAGE_PNG") bytes)"
+  else
+    echo "  ⚠ screenshot capture failed (non-fatal)"
+  fi
+else
+  echo "  ⚠ no chromium/chrome on PATH — skipping browser screenshot"
+fi
+
 # ── 4. End-to-end chat: Carl types a message, expects an AI reply ─────
 # Per Joel's "OOTB on MacBook Air, free, accessible" + "canary e2e
 # working from curl, Carl's case" — page-render is necessary but not