Merged
5 changes: 2 additions & 3 deletions Makefile
@@ -13,7 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
RAG_CONTENT_IMAGE ?= quay.io/redhat-ai-dev/rag-content:release-1.8-lcs
RAG_CONTENT_IMAGE ?= quay.io/redhat-ai-dev/rag-content:experimental-release-1.8-lcs
VENV := $(CURDIR)/scripts/python-scripts/.venv
PYTHON := $(VENV)/bin/python3
PIP := $(VENV)/bin/pip3
@@ -36,9 +36,8 @@ help: ## Show this help screen
awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-33s\033[0m %s\n", $$1, $$2}'
@echo ''

# TODO (Jdubrick): Replace reference to lightspeed-core/lightspeed-providers once bug is addressed.
update-question-validation:
curl -o ./config/providers.d/inline/safety/lightspeed_question_validity.yaml https://raw.githubusercontent.com/Jdubrick/lightspeed-providers/refs/heads/devai/resources/external_providers/inline/safety/lightspeed_question_validity.yaml
curl -o ./config/providers.d/inline/safety/lightspeed_question_validity.yaml https://raw.githubusercontent.com/lightspeed-core/lightspeed-providers/refs/tags/0.1.17/resources/external_providers/inline/safety/lightspeed_question_validity.yaml

$(VENV)/bin/activate: ./scripts/python-scripts/requirements.txt
python3 -m venv $(VENV)
87 changes: 54 additions & 33 deletions README.md
@@ -1,46 +1,48 @@
# Redhat-AI-Dev Llama Stack

[![Apache2.0 License](https://img.shields.io/badge/license-Apache2.0-brightgreen.svg)](LICENSE)
[![Llama Stack Version](https://img.shields.io/badge/llama_stack-v0.3.5-blue)](https://llamastack.github.io/docs/v0.3.5)
[![Python Version](https://img.shields.io/badge/python-3.12-blue)](https://www.python.org/downloads/release/python-3120/)

- [Image Availability](#image-availability)
- [Latest Stable Release](#latest-stable-release)
- [Latest Developer Release](#latest-developer-release)
- [Usage](#usage)
- [Available Inferences](#available-inferences)
- [vLLM](#vllm)
- [Ollama](#ollama)
- [OpenAI](#openai)
- [Vertex AI (Gemini)](#vertex-ai-gemini)
- [Configuring RAG](#configuring-rag)
- [Configuring Question Validation](#configuring-question-validation)
- [Running Locally](#running-locally)
- [Running on a Cluster](#running-on-a-cluster)
- [Configuring Safety Guards](#configuring-safety-guards)
- [Running Locally](#running-locally)
- [Running on a Cluster](#running-on-a-cluster)
- [Makefile Commands](#makefile-commands)
- [Contributing](#contributing)
- [Local Development Requirements](#local-development-requirements)
- [Updating YAML Files](#updating-yaml-files)
- [Troubleshooting](#troubleshooting)

## Image Availability
# Image Availability

### Latest Stable Release
## Latest Stable Release

```
quay.io/redhat-ai-dev/llama-stack:0.1.1
```

### Latest Developer Release
## Latest Developer Release

```
quay.io/redhat-ai-dev/llama-stack:latest
```

## Usage
# Usage

> [!IMPORTANT]
> The default Llama Stack configuration file that is baked into the built image contains tools. Ensure your provided inference server has tool calling **enabled**.

**Note:** You can enable `DEBUG` logging by setting:
```
LLAMA_STACK_LOGGING=all=DEBUG
```

### Available Inferences
## Available Inferences

Each inference has its own set of environment variables. You can include all of these variables in a `.env` file and pass that file to your container instead. See [default-values.env](./env/default-values.env) for a template. It is recommended that you copy that file to `values.env` to avoid committing your values to Git.
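
As a concrete sketch, a minimal `values.env` for the Ollama inference could look like this (variable names come from the template and from `run.yaml`; whether `ENABLE_OLLAMA` needs a particular value is an assumption based on the `${env.ENABLE_OLLAMA:+ollama}` substitution, which only checks that the variable is set):

```env
# Select the Ollama inference provider (any non-empty value enables it)
ENABLE_OLLAMA=true
# Ollama endpoint as seen from inside the container
OLLAMA_URL=http://host.containers.internal:11434
# Optional: verbose logging
LLAMA_STACK_LOGGING=all=DEBUG
```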

@@ -51,7 +53,7 @@ Each inference has its own set of environment variables. You can include all of
>
> VLLM_API_KEY="token" ❌

#### vLLM
### vLLM

**Required**
```env
@@ -65,7 +67,7 @@ VLLM_MAX_TOKENS=<defaults to 4096>
VLLM_TLS_VERIFY=<defaults to true>
```

#### Ollama
### Ollama

**Required**
```env
@@ -77,7 +79,7 @@ The value of `OLLAMA_URL` is the default `http://localhost:11434`, when you are

The value of `OLLAMA_URL` is `http://host.containers.internal:11434` if you are running llama-stack inside a container, i.e., if you run llama-stack with the podman run command above, it needs to reach the Ollama endpoint on your laptop, not one inside the container. **If you are using Linux**, ensure your firewall allows port 11434 from your podman container's network, as some Linux distributions' firewalls block all traffic by default. Alternatively, you can use `OLLAMA_URL=http://localhost:11434` and set the `--network host` flag when you run your podman container.
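
The two networking options described above can be sketched with `podman` as follows (volume mounts omitted for brevity; see [Running Locally](#running-locally) for the full command):

```sh
# Default podman network: reach the host's Ollama via the host gateway name
podman run -it -p 8321:8321 --env-file ./env/values.env \
  -e OLLAMA_URL=http://host.containers.internal:11434 \
  quay.io/redhat-ai-dev/llama-stack:latest

# Host networking: the container shares the host's network namespace,
# so localhost resolves to the host-side Ollama endpoint
podman run -it --network host --env-file ./env/values.env \
  -e OLLAMA_URL=http://localhost:11434 \
  quay.io/redhat-ai-dev/llama-stack:latest
```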

#### OpenAI
### OpenAI

**Required**
```env
@@ -87,7 +89,7 @@ OPENAI_API_KEY=<your-api-key>

To get your API Key, go to [platform.openai.com](https://platform.openai.com/settings/organization/api-keys).

#### Vertex AI (Gemini)
### Vertex AI (Gemini)

**Required**
```env
@@ -99,7 +101,7 @@ GOOGLE_APPLICATION_CREDENTIALS=

For information about these variables see: https://llamastack.github.io/v0.2.18/providers/inference/remote_vertexai.html.

### Configuring RAG
## Configuring RAG

The `run.yaml` file that is included in the container image has a RAG tool enabled. In order for this tool to have the necessary reference content, you need to run:

@@ -109,25 +111,38 @@ make get-rag

This will fetch the necessary reference content and add it to your local project directory.
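
After it completes, the project directory should contain content roughly like this (a sketch inferred from the volume mounts and the FAISS store path used in this repository's configuration; the exact layout may differ):

```
embeddings_model/                 # local copy of the embeddings model
vector_db/
└── rhdh_product_docs/
    └── 1.8/
        └── faiss_store.db        # FAISS store referenced by the config
```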

### Configuring Question Validation
## Configuring Safety Guards

> [!IMPORTANT]
> If you want to omit the safety guards for development purposes, you can use [run-no-guard.yaml](./run-no-guard.yaml) instead.

In the main [run.yaml](./run.yaml) file, Llama Guard is enabled by default. To avoid issues during startup, ensure you have an instance of Llama Guard running.

By default this Llama Stack has a Safety Shield for question validation enabled. You will need to set the following environment variables to ensure functionality:
You can do so by running the following to start an Ollama container with Llama Guard:

- `VALIDATION_PROVIDER`: The provider you want to use for question validation. This should match what the provider value you are using under `inference`, such as `vllm`, `ollama`, `openai`. Defaults to `vllm`
- `VALIDATION_MODEL_NAME`: The name of the LLM you want to use for question validation
```sh
podman run -d --name ollama -p 11434:11434 docker.io/ollama/ollama:latest
podman exec ollama ollama pull llama-guard3:8b
```
**Note:** If you are deploying the containers manually, ensure the Ollama container has started and the model has finished pulling before sending queries.
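
A hedged way to script that readiness check (assumes the container started with the commands above and Ollama's standard `/api/tags` endpoint):

```sh
# Block until the Ollama API responds, then confirm the model was pulled
until curl -sf http://localhost:11434/api/tags >/dev/null; do sleep 1; done
podman exec ollama ollama list | grep llama-guard3
```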

### Running Locally
You will need to set the following environment variables to ensure functionality:
- `SAFETY_MODEL`: The name of the Llama Guard model being used. Defaults to `llama-guard3:8b`
- `SAFETY_URL`: The URL where the safety model is available. Defaults to `http://host.docker.internal:11434/v1`
- `SAFETY_API_KEY`: The API key required to access the safety model. Not required for local deployments.
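
In a `values.env` file these might look like the following (the values shown are simply the documented defaults):

```env
SAFETY_MODEL=llama-guard3:8b
SAFETY_URL=http://host.docker.internal:11434/v1
# Leave empty for local setups
SAFETY_API_KEY=
```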

# Running Locally

```
podman run -it -p 8321:8321 --env-file ./env/values.env -v ./embeddings_model:/app-root/embeddings_model:Z -v ./vector_db/rhdh_product_docs:/app-root/vector_db/rhdh_product_docs:Z quay.io/redhat-ai-dev/llama-stack:latest
podman run -it -p 8321:8321 --env-file ./env/values.env -v ./embeddings_model:/rag-content/embeddings_model:Z -v ./vector_db/rhdh_product_docs:/rag-content/vector_db/rhdh_product_docs:Z quay.io/redhat-ai-dev/llama-stack:latest
```

Or if using the host network:
```
podman run -it -p 8321:8321 --env-file ./env/values.env --network host -v ./embeddings_model:/app-root/embeddings_model:Z -v ./vector_db/rhdh_product_docs:/app-root/vector_db/rhdh_product_docs:Z quay.io/redhat-ai-dev/llama-stack:latest
podman run -it -p 8321:8321 --env-file ./env/values.env --network host -v ./embeddings_model:/rag-content/embeddings_model:Z -v ./vector_db/rhdh_product_docs:/rag-content/vector_db/rhdh_product_docs:Z quay.io/redhat-ai-dev/llama-stack:latest
```

Latest Lightspeed Core developer image:
Latest Lightspeed Core Developer Image:
```
quay.io/lightspeed-core/lightspeed-stack:dev-latest
```
@@ -139,7 +154,7 @@ podman run -it -p 8080:8080 -v ./lightspeed-stack.yaml:/app-root/lightspeed-stac

**Note:** If you have built your own version of Lightspeed Core you can replace the image referenced with your own build. Additionally, you can use the Llama Stack container along with the `lightspeed-stack.yaml` file to run Lightspeed Core locally with `uv` from their [repository](https://github.com/lightspeed-core/lightspeed-stack).

### Running on a Cluster
# Running on a Cluster

To deploy on a cluster see [DEPLOYMENT.md](./docs/DEPLOYMENT.md).

@@ -149,17 +164,17 @@ To deploy on a cluster see [DEPLOYMENT.md](./docs/DEPLOYMENT.md).
| ---- | ----|
| **get-rag** | Gets the RAG data and the embeddings model from the rag-content image registry to your local project directory |
| **update-question-validation** | Updates the question validation content in `providers.d` |
| **validate-prompt-templates** | Validates prompt values in run.yaml. **Requires Python >= 3.11** |
| **update-prompt-templates** | Updates the prompt values in run.yaml. **Requires Python >= 3.11** |
| **validate-prompt-templates** | Validates prompt values in run.yaml. |
| **update-prompt-templates** | Updates the prompt values in run.yaml. |

## Contributing
# Contributing

### Local Development Requirements
## Local Development Requirements

- [Yarn](https://yarnpkg.com/)
- [Node.js >= v22](https://nodejs.org/en/about/previous-releases)

### Updating YAML Files
## Updating YAML Files

This repository implements Prettier to handle all YAML formatting.
```sh
@@ -169,7 +184,13 @@ yarn verify # Runs Prettier to check the YAML files in this repository

If you wish to try new changes with Llama Stack, you can build your own image using the `Containerfile` in the root of this repository.

## Troubleshooting
# Troubleshooting

>[!NOTE]
> You can enable `DEBUG` logging by setting:
>```
>LLAMA_STACK_LOGGING=all=DEBUG
>```

If you experience an error related to permissions for the `vector_db`, such as:

8 changes: 8 additions & 0 deletions env/default-values.env
@@ -35,5 +35,13 @@ OLLAMA_URL=
VALIDATION_PROVIDER=
VALIDATION_MODEL_NAME=

# Llama Guard Settings
## Defaults to llama-guard3:8b if not set
SAFETY_MODEL=
## Defaults to http://host.docker.internal:11434/v1 if not set
SAFETY_URL=
## Only required for non-local environments with an API key
SAFETY_API_KEY=

# Other
LLAMA_STACK_LOGGING=
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -7,10 +7,10 @@ dependencies = [
"fastapi>=0.115.6",
"uvicorn>=0.34.3",
"kubernetes>=30.1.0",
"llama-stack==0.2.18",
"llama-stack-client==0.2.18",
"llama-stack==0.3.5",
"llama-stack-client==0.3.5",
"ollama>=0.2.0",
"openai==1.99.9",
"openai>=1.100.0",
"rich>=14.0.0",
"cachetools>=6.1.0",
"prometheus-client>=0.22.1",
@@ -34,9 +34,9 @@ dependencies = [
"greenlet",
"torch",
"sentence-transformers>=5.0.0",
"lightspeed_stack_providers @ git+https://github.com/Jdubrick/lightspeed-providers.git@devai",
"pydantic>=2.10.6",
"httpx",
"chardet",
]
requires-python = "==3.12.*"
readme = "README.md"
131 changes: 131 additions & 0 deletions run-no-guard.yaml
@@ -0,0 +1,131 @@
#
#
# Copyright Red Hat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
version: 2
image_name: redhat-ai-dev-llama-stack-no-guard
apis:
- agents
- inference
- safety
- tool_runtime
- vector_io
- files
container_image:
external_providers_dir:
providers:
agents:
- config:
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
provider_id: meta-reference
provider_type: inline::meta-reference
inference:
- provider_id: ${env.ENABLE_VLLM:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
api_token: ${env.VLLM_API_KEY:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.ENABLE_OLLAMA:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.ENABLE_OPENAI:+openai}
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
- provider_id: ${env.ENABLE_VERTEX_AI:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
tool_runtime:
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: faiss_kv
files:
- provider_id: localfs
provider_type: inline::localfs
config:
storage_dir: /tmp/llama-stack-files
metadata_store:
table_name: files_metadata
backend: sql_default
storage:
backends:
kv_default:
type: kv_sqlite
db_path: /tmp/kvstore.db
sql_default:
type: sql_sqlite
db_path: /tmp/sql_store.db
faiss_kv:
type: kv_sqlite
db_path: /rag-content/vector_db/rhdh_product_docs/1.8/faiss_store.db
stores:
metadata:
namespace: registry
backend: faiss_kv
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
registered_resources:
models:
- model_id: sentence-transformers/all-mpnet-base-v2
metadata:
embedding_dimension: 768
model_type: embedding
provider_id: sentence-transformers
provider_model_id: /rag-content/embeddings_model
tool_groups:
- provider_id: rag-runtime
toolgroup_id: builtin::rag
vector_dbs:
- vector_db_id: rhdh-product-docs-1_8
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
provider_id: faiss
server:
auth:
host:
port: 8321
quota:
tls_cafile:
tls_certfile:
tls_keyfile: