wendylabsinc · EBro912 · Mar 30, 2026 · Mar 31, 2026
diff --git a/deepstream-vision/detector/.dockerignore b/deepstream-vision/detector/.dockerignore
@@ -1,4 +1,4 @@
 # Extraction script (not needed in container)
 extract_engine.sh
 
-# Note: Pre-built engine files (*.engine) are now included in the image for fast startup
+# TensorRT engine files (*.engine) are device-specific; they are generated at first startup, not shipped in the image
diff --git a/deepstream-vision/detector/Dockerfile b/deepstream-vision/detector/Dockerfile
@@ -130,10 +130,8 @@ COPY --from=builder /compiled/yolo11n.onnx /app/
 # Copy labels from the export (auto-generated by export script)
 COPY --from=builder /compiled/labels.txt /app/labels_exported.txt
 
-# Pre-built TensorRT engine for fast startup (skip 8+ minute build time)
-# Engine was built using NvDsInferYoloCudaEngineGet from the custom YOLO library
-# on Jetson Orin with DeepStream 7.1 + CUDA 12.6 + TensorRT 10.x
-COPY model_b2_gpu0_fp16.engine /app/
+# TensorRT engine is auto-generated by nvinfer on first startup from yolo11n.onnx
+# using the NvDsInferYoloCudaEngineGet custom engine builder (~5-10 min on first run)
 
 # Copy application files
 COPY detector.py /app/

diff --git a/deepstream-vision/vlm/Dockerfile b/deepstream-vision/vlm/Dockerfile
@@ -1,6 +1,6 @@
 # Qwen3-VL Vision-Language Model Service
 # Optimized for Jetson Orin Nano with INT4 quantization using Jetson-compatible bitsandbytes
-# Model is baked into the image for fast startup (no downloads needed)
+# Model is downloaded on first startup via HuggingFace Hub (~4GB)
 
 FROM ubuntu:22.04
 
@@ -17,9 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy pre-downloaded Qwen3-VL-2B model EARLY for better layer caching
-# Download once with: huggingface-cli download Qwen/Qwen3-VL-2B-Instruct --local-dir ./models/Qwen3-VL-2B-Instruct
-COPY models/Qwen3-VL-2B-Instruct /app/models/Qwen3-VL-2B-Instruct
+# Model is downloaded at runtime on first startup via HuggingFace Hub
+# (Qwen/Qwen3-VL-2B-Instruct, ~4GB, public - no token needed)
 
 # Upgrade pip
 RUN pip3 install --no-cache-dir --upgrade pip

diff --git a/deepstream-vision/vlm/qwen3_service.py b/deepstream-vision/vlm/qwen3_service.py
@@ -72,7 +72,7 @@ def add_cors_headers(response):
 model_loaded = False
 
 # Configuration
-MODEL_PATH = "/app/models/Qwen3-VL-2B-Instruct"
+MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
 MODEL_NAME = "Qwen3-VL-2B-Instruct"
 MAX_IMAGE_SIZE = 672  # Resize large images for faster processing
 
@@ -100,7 +100,7 @@ def load_model():
         processor = AutoProcessor.from_pretrained(
             MODEL_PATH,
             trust_remote_code=True,
-            local_files_only=True
+            local_files_only=False
         )
         logger.info("Processor loaded successfully")
 
@@ -130,7 +130,7 @@ def load_model():
             "trust_remote_code": True,
             "quantization_config": quantization_config,
             "device_map": 'cuda',
-            "local_files_only": True,
+            "local_files_only": False,
         }
         if attn_impl:
             model_kwargs["attn_implementation"] = attn_impl

diff --git a/python/pipecat-assistant/Dockerfile b/python/pipecat-assistant/Dockerfile
@@ -29,8 +29,7 @@ RUN uv pip install --system -r requirements.txt
 COPY server/app.py ./
 COPY server/sounds ./sounds
 
-# Copy .env file (contains API keys)
-COPY server/.env ./
+# API keys (DEEPGRAM_API_KEY, XAI_API_KEY) must be supplied as environment variables at runtime; no .env file is copied into the image
 
 # Copy the built frontend
 COPY --from=frontend-builder /app/frontend/dist ./frontend/dist