From d68922ac52911d32df4b2d6ae065d5164acf3df3 Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 21 Apr 2026 14:57:16 -0500
Subject: [PATCH 1/8] feat: add llama-3.1-8b-instruct model card and deployment
 guide for Dell EI

Signed-off-by: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
---
 .../llama-3.1-8b-instruct/deployment.md       | 71 ++++++++++++++++++
 .../llama-3.1-8b-instruct/model-card.md       | 64 ++++++++++++++++
 .../llama-3.1-8b-instruct/xeon-deployment.md  | 73 +++++++++++++++++++
 3 files changed, 208 insertions(+)
 create mode 100644 third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
 create mode 100644 third_party/Dell/model-deployment/llama-3.1-8b-instruct/model-card.md
 create mode 100644 third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md

diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
new file mode 100644
index 00000000..608e7d83
--- /dev/null
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
@@ -0,0 +1,71 @@
+# Deployed with EI Version-1.2
+
+## Step 1: Set Environment Variables
+
+```bash
+# Export Hugging Face token
+export HUGGING_FACE_HUB_TOKEN="your_token_here"
+
+# Set your base URL and API token
+export BASE_HOST="your-cluster-url"
+
+#generate keyclock token
+export BASE_URL="https://your-cluster-url"
+export KEYCLOAK_CLIENT_ID=api
+export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
+export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+```
+
+## Step 2: Deploy Llama-3.1-8B-Instruct Model
+
+```bash
+helm install vllm-llama-8b ./core/helm-charts/vllm \
+--values ./core/helm-charts/vllm/gaudi3-values.yaml \
+--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
+--set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+--set ingress.enabled=true \
+--set ingress.host="$BASE_HOST" \
+--set ingress.secretname="$BASE_HOST" \
+--force
+```
+
+## Step 3: Test the Deployed Model
+
+```bash
+curl -k ${BASE_URL}/Llama-3.1-8B-Instruct/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+## To undeploy the model
+
+```bash
+helm uninstall vllm-llama-8b
+```
+## Parameters
+
+| Parameter                                                 | Description                                                                                           |
+| --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"`   | Defines the target model from **Hugging Face** to deploy.                                             |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`             | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
+| `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
+| `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
+| `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/model-card.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/model-card.md
new file mode 100644
index 00000000..b6f1a3e9
--- /dev/null
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/model-card.md
@@ -0,0 +1,64 @@
+# Llama-3.1-8B-Instruct
+
+This model uses Llama-3.1-8B-Instruct, an 8-billion-parameter instruction-tuned model from Meta Platforms, Inc. (Meta AI). It belongs to the Llama 3.1 model family and is optimized for multilingual dialogue, code tasks, and general instruction-following across a large context window.
+
+For full details including model specifications, licensing, intended use, safety guidance, and example prompts, please visit the **official Hugging Face page**:
+
+https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+
+This model provides inference services only; weights are hosted by Hugging Face under Meta’s license.
+
+Ensure compliance with the Llama 3.1 Community License Agreement before using this model.
+
+### Model Attribution
+
+**Developer:**	Meta Platforms, Inc. (Meta AI)
+
+**Purpose:** Instruction-following model for dialogue, code generation/completion, multilingual tasks
+
+**Sizes/Variants:**	8 B parameters (instruction tuned); the Llama 3.1 family also includes 70 B and 405 B parameter variants
+
+**Modalities:**	Text input → Text (including code) output
+
+**Parameter Size:** ~8 billion
+
+**Max Context:**	Up to ~128 k tokens (for the 3.1 family)
+
+**License:** Llama 3.1 Community License (custom commercial license)
+
+**Minimum required CPU Cores:** 157
+
+**Minimum required PCIe Cards:** 1
+
+### Usage Notice
+
+**By using this model, you agree that:**
+
+- Inputs and outputs are processed through Llama-3.1-8B-Instruct under Meta’s Community License.
+- You will comply with Meta’s licensing terms, including restrictions on redistribution, commercial scale-use thresholds, attribution (“Built with Llama”), and acceptable use policy.
+- All generated content (text or code) must be reviewed for accuracy, compliance, and safety before deployment.
+- The model should not be used for generating malicious content, disallowed content, or automating decisions in high-risk or regulated systems without appropriate safeguards.
+
+### Intended Applications
+
+- Instruction-following chatbots and assistants (multilingual)
+- Code generation, completion, refactoring tasks (Python, Java, JavaScript, etc.)
+- Multilingual support (English, German, French, Italian, Portuguese, Hindi, Spanish, Thai) and potentially others with fine-tuning.
+- Large-context tasks: summarization of long documents, dialog over long history, RAG (retrieval-augmented generation) over extended context.
+- Research, prototyping, and commercial workflows (subject to license terms).
+
+### Limitations
+
+- Although capable, the 8 B size still has trade-offs: accuracy and depth of reasoning may lag behind much larger models.
+- As with all large language models, risk of hallucinations (incorrect statements), biases, or unsafe outputs remains.
+- The custom license restricts certain uses (e.g., if your product has > 700 million monthly active users you may require a special license) as described in Meta’s license terms.
+- The model does not guarantee tool-use, vision/multimodal input (unless you fine-tune or wrap appropriately) – it is primarily text → text.
+- Running it efficiently still requires significant hardware/resources for full context and best performance.
+
+### References
+
+“Introducing Llama 3.1: Our most capable models to date”. https://ai.meta.com/blog/meta-llama-3-1
+
+Hugging Face Model Card: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+
+Meta Llama GitHub Repository & License Details. https://github.com/meta-llama/llama3
diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md
new file mode 100644
index 00000000..52bf21ca
--- /dev/null
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md
@@ -0,0 +1,73 @@
+
+# Deployed with EI Version-1.3.1
+
+## Step 1: Set Environment Variables
+
+```bash
+# Export Hugging Face token
+export HUGGING_FACE_HUB_TOKEN="your_token_here"
+
+# Set your base URL and API token
+export BASE_HOST="your-cluster-url"
+
+#generate keyclock token
+export BASE_URL="https://your-cluster-url"
+export KEYCLOAK_CLIENT_ID=api
+export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
+export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+```
+
+## Step 2: Deploy Llama-3.1-8B-Instruct Model
+
+```bash
+helm install vllm-llama-8b-cpu ./core/helm-charts/vllm \
+  --values ./core/helm-charts/vllm/xeon-values.yaml \
+  --set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
+  --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+  --set ingress.enabled=false \
+  --set ingress.host="${BASE_HOST}" \
+  --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
+  --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
+  --set apisix.enabled=true \
+  --set tensor_parallel_size="1" \
+  --set pipeline_parallel_size="1"
+```
+
+## Step 3: Test the Deployed Model
+
+```bash
+curl -k ${BASE_URL}/Llama-3.1-8B-Instruct-vllmcpu/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+
+## To undeploy the model
+
+```bash
+helm uninstall vllm-llama-8b-cpu
+```
+## Parameters
+
+| Parameter                                                 | Description                                                                                           |
+| --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"`   | Defines the target model from **Hugging Face** to deploy.                                             |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`             | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
+| `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
+| `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
+| `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
+
+
+
+
+
+
+
+
+

From 9e3c97d54088349056e8c5607e78f11baf76bb07 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 11:58:30 -0500
Subject: [PATCH 2/8] updated llama 3.1 8b instruct deployment.md

---
 .../llama-3.1-8b-instruct/deployment.md       | 172 ++++++++++--------
 1 file changed, 101 insertions(+), 71 deletions(-)

diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
index 608e7d83..7bb38d49 100644
--- a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
@@ -1,71 +1,101 @@
-# Deployed with EI Version-1.2
-
-## Step 1: Set Environment Variables
-
-```bash
-# Export Hugging Face token
-export HUGGING_FACE_HUB_TOKEN="your_token_here"
-
-# Set your base URL and API token
-export BASE_HOST="your-cluster-url"
-
-#generate keyclock token
-export BASE_URL="https://your-cluster-url"
-export KEYCLOAK_CLIENT_ID=api
-export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
-export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
-```
-
-## Step 2: Deploy Llama-3.1-8B-Instruct Model
-
-```bash
-helm install vllm-llama-8b ./core/helm-charts/vllm \
---values ./core/helm-charts/vllm/gaudi3-values.yaml \
---set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
---set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
---set ingress.enabled=true \
---set ingress.host="$BASE_HOST" \
---set ingress.secretname="$BASE_HOST" \
---force
-```
-
-## Step 3: Test the Deployed Model
-
-```bash
-curl -k ${BASE_URL}/Llama-3.1-8B-Instruct/v1/completions \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer $TOKEN" \
-  -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
-    "prompt": "What is Deep Learning?",
-    "max_tokens": 25,
-    "temperature": 0
-  }'
-```
-## To undeploy the model
-
-```bash
-helm uninstall vllm-llama-8b
-```
-## Parameters
-
-| Parameter                                                 | Description                                                                                           |
-| --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
-| `--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"`   | Defines the target model from **Hugging Face** to deploy.                                             |
-| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`             | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
-| `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
-| `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
-| `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
-
-
-
-
-
-
-
-
-
-
-
-
+## Step 1: Prerequisites to Deploy Llama-3.1-8B-Instruct Model on Gaudi with Keycloak
+
+Ensure the Enterprise Inference stack with Keycloak is already deployed before proceeding.
+
+Edit `core/scripts/generate-token.sh` and set your values before sourcing it:
+
+| Variable                  | Description                                                              |
+| ------------------------- | ------------------------------------------------------------------------ |
+| `BASE_URL`                | Hostname of your cluster (e.g. `api.example.com`), without `https://`   |
+| `KEYCLOAK_ADMIN_USERNAME` | Keycloak admin username                                                  |
+| `KEYCLOAK_PASSWORD`       | Keycloak admin password                                                  |
+| `KEYCLOAK_CLIENT_ID`      | Keycloak client ID configured during EI deployment                       |
+
+Then run:
+
+```bash
+export HUGGING_FACE_HUB_TOKEN="your_token_here"
+
+cd ~/Enterprise-Inference
+source core/scripts/generate-token.sh
+```
+
+This exports: `BASE_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_CLIENT_SECRET`, and `TOKEN`.
+
+## Step 2: Deploy Llama-3.1-8B-Instruct Model
+
+```bash
+helm install vllm-llama-8b ./core/helm-charts/vllm \
+  --values ./core/helm-charts/vllm/gaudi3-values.yaml \
+  --set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
+  --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+  --set ingress.enabled=true \
+  --set ingress.secretname="${BASE_URL}" \
+  --set ingress.host="${BASE_URL}" \
+  --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
+  --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
+  --set apisix.enabled=true \
+  --set tensor_parallel_size="1" \
+  --set pipeline_parallel_size="1"
+```
+
+## Step 3: Verify the Deployment
+
+```bash
+kubectl get pods
+kubectl get apisixroutes
+```
+
+Expected Output:
+
+```
+NAME                                    READY   STATUS    RESTARTS
+keycloak-0                              1/1     Running   0
+keycloak-postgresql-0                   1/1     Running   0
+vllm-llama-8b-<hash>-<hash>             1/1     Running   0
+```
+
+> Note: The pod name suffix `<hash>-<hash>` is auto-generated by Kubernetes and will differ on each deployment. Ensure all pods show `1/1 Running`.
+
+```
+NAME                         HOSTS
+vllm-llama-8b-apisixroute    api.example.com
+```
+
+## Step 4: Test the Deployed Model
+
+```bash
+curl -k https://${BASE_URL}/Llama-3.1-8B-Instruct/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+
+If successful, the model will return a completion response.
+
+## To undeploy the model
+
+```bash
+helm uninstall vllm-llama-8b
+```
+
+## Parameters
+
+| Parameter                                                  | Description                                                                                       |
+| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"`   | Defines the target model from **Hugging Face** to deploy.                                         |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`              | Authenticates access to gated or private Hugging Face models. Replace with your own secure token. |
+| `--set ingress.enabled=true`                               | Enables Kubernetes **Ingress** to expose the model service externally.                            |
+| `--set ingress.host="${BASE_URL}"`                         | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).          |
+| `--set ingress.secretname="${BASE_URL}"`                   | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                        |
+| `--set oidc.client_id="..."`                               | Keycloak OIDC client ID used for token-based authentication.                                      |
+| `--set oidc.client_secret="..."`                           | Keycloak OIDC client secret corresponding to the client ID.                                       |
+| `--set apisix.enabled=true`                                | Enables **APISIX** as the API gateway for routing and authentication.                             |
+| `--set tensor_parallel_size="1"`                           | Number of tensor-parallel workers the model is sharded across. `1` runs it on a single worker.    |
+| `--set pipeline_parallel_size="1"`                         | Number of pipeline parallel stages. Typically `1` for single-node deployments.                   |

From 806461e45e66b570bf504802b5927847b314afbe Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 12:17:48 -0500
Subject: [PATCH 3/8] updating llama 3.1 8b instruct deployment.md

---
 .../Dell/model-deployment/llama-3.1-8b-instruct/deployment.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
index 7bb38d49..4971d873 100644
--- a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
@@ -1,4 +1,4 @@
-## Step 1: Prerequisites to Deploy Llama-3.1-8B-Instruct Model on Gaudi with Keycloak
+## Step 1: Prerequisites to Deploy Llama-3.1-8B-Instruct Model on Xeon with Keycloak
 
 Ensure the Enterprise Inference stack with Keycloak is already deployed before proceeding.
 
@@ -26,7 +26,7 @@ This exports: `BASE_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_CLIENT_SECRET`, and `T
 
 ```bash
 helm install vllm-llama-8b ./core/helm-charts/vllm \
-  --values ./core/helm-charts/vllm/gaudi3-values.yaml \
+  --values ./core/helm-charts/vllm/xeon-values.yaml \
   --set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
   --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
   --set ingress.enabled=true \

From 7bcf2bc0fe65f7b3f73cf4aaa8d42d138dbf1353 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 12:28:28 -0500
Subject: [PATCH 4/8] update llama 3.1 8b instruct deployment.md

---
 .../Dell/model-deployment/llama-3.1-8b-instruct/deployment.md   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
index 4971d873..b3c94140 100644
--- a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
+++ b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/deployment.md
@@ -65,7 +65,7 @@ vllm-llama-8b-apisixroute    api.example.com
 ## Step 4: Test the Deployed Model
 
 ```bash
-curl -k https://${BASE_URL}/Llama-3.1-8B-Instruct/v1/completions \
+curl -k https://${BASE_URL}/Llama-3.1-8B-Instruct-vllmcpu/v1/completions \
   -X POST \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer $TOKEN" \

From 8eaa07807a17d8d388102c1f2e92c9e90ea14924 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 26 May 2026 09:21:45 -0500
Subject: [PATCH 5/8] Add model deployment troubleshooting guide for 504
 gateway timeout

---
 .../Dell/model-deployment/troubleshooting.md  | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 third_party/Dell/model-deployment/troubleshooting.md

diff --git a/third_party/Dell/model-deployment/troubleshooting.md b/third_party/Dell/model-deployment/troubleshooting.md
new file mode 100644
index 00000000..15cd76b2
--- /dev/null
+++ b/third_party/Dell/model-deployment/troubleshooting.md
@@ -0,0 +1,79 @@
+# Troubleshooting Guide
+
+This section provides common issues observed when running inference against models deployed via Helm commands on Intel® AI for Enterprise Inference — along with step-by-step resolutions.
+
+**Issues:**
+  1. [Gateway Timeout (504) on Inference Requests](#1-gateway-timeout-504-on-inference-requests)
+
+---
+
+### 1. Gateway Timeout (504) on Inference Requests
+
+**Context:** Model deployed via Helm commands. Inference request sent through the ingress stack (ingress-nginx → APISIX → vLLM service).
+
+**Error:** Inference requests return `504 Gateway Timeout` after 60 seconds:
+
+```
+"POST /<model-name>/v1/completions HTTP/2.0" 504
+upstream timed out (110: Operation timed out) ... 60.001
+```
+
+**Cause:**
+
+CPU-based model inference (`vllm-cpu`) generates tokens at ~0.3–0.4 tokens/s. Responses requiring more than ~24 tokens exceed the default 60s upstream timeout enforced by ingress-nginx and APISIX.
+
+**Fix:**
+
+**Step 1 — Increase the nginx ingress timeout**
+
+Apply to both the `default` and `auth-apisix` namespaces. To find ingress names:
+
+```bash
+kubectl get ingress -A | grep <model-name>
+```
+
+Then annotate each ingress:
+
+```bash
+kubectl annotate ingress <ingress-name> -n <namespace> \
+  nginx.ingress.kubernetes.io/proxy-read-timeout="300" \
+  nginx.ingress.kubernetes.io/proxy-send-timeout="300" \
+  nginx.ingress.kubernetes.io/proxy-connect-timeout="60" \
+  --overwrite
+```
+
+**Step 2 — Increase the APISIX route timeout**
+
+To find the route name:
+
+```bash
+kubectl get apisixroute -n auth-apisix | grep <model-name>
+```
+
+Edit the route:
+
+```bash
+kubectl edit apisixroute <route-name> -n auth-apisix
+```
+
+Update the timeout section under the route:
+
+```yaml
+spec:
+  http:
+    - name: <route-name>
+      timeout:
+        connect: 60s
+        send: 300s
+        read: 300s
+```
+
+**Verification:**
+
+Re-run the inference request and confirm a `200 OK` response is returned within the new timeout window.
+
+**Notes:**
+
+- The nginx ingress annotation takes effect immediately — no pod restart required.
+- For GPU-based deployments this timeout is rarely needed as throughput is significantly higher (30–50 tokens/s vs 0.3–0.4 tokens/s on CPU).
+- If requests still time out after increasing both timeouts, reduce `max_tokens` in the request payload to limit response length.

From 3410c1d2fe076c442e474fc16e1e3db74fae0663 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 26 May 2026 09:27:15 -0500
Subject: [PATCH 6/8] Remove em dashes from troubleshooting guide

---
 third_party/Dell/model-deployment/troubleshooting.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/Dell/model-deployment/troubleshooting.md b/third_party/Dell/model-deployment/troubleshooting.md
index 15cd76b2..68d4b7be 100644
--- a/third_party/Dell/model-deployment/troubleshooting.md
+++ b/third_party/Dell/model-deployment/troubleshooting.md
@@ -1,6 +1,6 @@
 # Troubleshooting Guide
 
-This section provides common issues observed when running inference against models deployed via Helm commands on Intel® AI for Enterprise Inference — along with step-by-step resolutions.
+This section describes common issues observed when running inference against models deployed via Helm commands on Intel® AI for Enterprise Inference, along with step-by-step resolutions.
 
 **Issues:**
   1. [Gateway Timeout (504) on Inference Requests](#1-gateway-timeout-504-on-inference-requests)
@@ -24,7 +24,7 @@ CPU-based model inference (`vllm-cpu`) generates tokens at ~0.3–0.4 tokens/s.
 
 **Fix:**
 
-**Step 1 — Increase the nginx ingress timeout**
+**Step 1 - Increase the nginx ingress timeout**
 
 Apply to both the `default` and `auth-apisix` namespaces. To find ingress names:
 
@@ -42,7 +42,7 @@ kubectl annotate ingress <ingress-name> -n <namespace> \
   --overwrite
 ```
 
-**Step 2 — Increase the APISIX route timeout**
+**Step 2 - Increase the APISIX route timeout**
 
 To find the route name:
 
@@ -74,6 +74,6 @@ Re-run the inference request and confirm a `200 OK` response is returned within
 
 **Notes:**
 
-- The nginx ingress annotation takes effect immediately — no pod restart required.
+- The nginx ingress annotation takes effect immediately; no pod restart required.
 - For GPU-based deployments this timeout is rarely needed as throughput is significantly higher (30–50 tokens/s vs 0.3–0.4 tokens/s on CPU).
 - If requests still time out after increasing both timeouts, reduce `max_tokens` in the request payload to limit response length.

From 1d52bc55a224e68467a796c235b7f4f1f410857e Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 26 May 2026 09:44:20 -0500
Subject: [PATCH 7/8] update troubleshooting.md

---
 .../llama-3.1-8b-instruct/xeon-deployment.md  | 73 -------------------
 .../Dell/model-deployment/troubleshooting.md  |  6 +-
 2 files changed, 3 insertions(+), 76 deletions(-)
 delete mode 100644 third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md

diff --git a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md b/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md
deleted file mode 100644
index 52bf21ca..00000000
--- a/third_party/Dell/model-deployment/llama-3.1-8b-instruct/xeon-deployment.md
+++ /dev/null
@@ -1,73 +0,0 @@
-
-# Deployed with EI Version-1.3.1
-
-## Step 1: Set Environment Variables
-
-```bash
-# Export Hugging Face token
-export HUGGING_FACE_HUB_TOKEN="your_token_here"
-
-# Set your base URL and API token
-export BASE_HOST="your-cluster-url"
-
-#generate keyclock token
-export BASE_URL="https://your-cluster-url"
-export KEYCLOAK_CLIENT_ID=api
-export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
-export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
-```
-
-## Step 2: Deploy Llama-3.1-8B-Instruct Model
-
-```bash
-helm install vllm-llama-8b-cpu ./core/helm-charts/vllm \
-  --values ./core/helm-charts/vllm/xeon-values.yaml \
-  --set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct" \
-  --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
-  --set ingress.enabled=false \
-  --set ingress.host="${BASE_HOST}" \
-  --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
-  --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
-  --set apisix.enabled=true \
-  --set tensor_parallel_size="1" \
-  --set pipeline_parallel_size="1"
-```
-
-## Step 3: Test the Deployed Model
-
-```bash
-curl -k ${BASE_URL}/Llama-3.1-8B-Instruct-vllmcpu/v1/completions \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -H "Authorization: Bearer $TOKEN" \
-  -d '{
-    "model": "meta-llama/Llama-3.1-8B-Instruct",
-    "prompt": "What is Deep Learning?",
-    "max_tokens": 25,
-    "temperature": 0
-  }'
-```
-
-## To undeploy the model
-
-```bash
-helm uninstall vllm-llama-8b-cpu
-```
-## Parameters
-
-| Parameter                                                 | Description                                                                                           |
-| --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
-| `--set LLM_MODEL_ID="meta-llama/Llama-3.1-8B-Instruct"`   | Defines the target model from **Hugging Face** to deploy.                                             |
-| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`             | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
-| `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
-| `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
-| `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
-
-
-
-
-
-
-
-
-
diff --git a/third_party/Dell/model-deployment/troubleshooting.md b/third_party/Dell/model-deployment/troubleshooting.md
index 68d4b7be..c555802d 100644
--- a/third_party/Dell/model-deployment/troubleshooting.md
+++ b/third_party/Dell/model-deployment/troubleshooting.md
@@ -9,7 +9,7 @@ This section provides common issues observed when running inference against mode
 
 ### 1. Gateway Timeout (504) on Inference Requests
 
-**Context:** Model deployed via Helm commands. Inference request sent through the ingress stack (ingress-nginx → APISIX → vLLM service).
+**Context:** Model deployed via Helm commands. Inference request sent through the ingress stack (ingress-nginx -> APISIX -> vLLM service).
 
 **Error:** Inference requests return `504 Gateway Timeout` after 60 seconds:
 
@@ -20,7 +20,7 @@ upstream timed out (110: Operation timed out) ... 60.001
 
 **Cause:**
 
-CPU-based model inference (`vllm-cpu`) generates tokens at ~0.3–0.4 tokens/s. Responses requiring more than ~24 tokens exceed the default 60s upstream timeout enforced by ingress-nginx and APISIX.
+CPU-based model inference (`vllm-cpu`) generates tokens at ~0.3-0.4 tokens/s. Responses requiring more than ~24 tokens exceed the default 60s upstream timeout enforced by ingress-nginx and APISIX.
 
 **Fix:**
 
@@ -75,5 +75,5 @@ Re-run the inference request and confirm a `200 OK` response is returned within
 **Notes:**
 
 - The nginx ingress annotation takes effect immediately; no pod restart required.
-- For GPU-based deployments this timeout is rarely needed as throughput is significantly higher (30–50 tokens/s vs 0.3–0.4 tokens/s on CPU).
+- For GPU-based deployments this timeout is rarely needed as throughput is significantly higher (30-50 tokens/s vs 0.3-0.4 tokens/s on CPU).
 - If requests still time out after increasing both timeouts, reduce `max_tokens` in the request payload to limit response length.

From 971441a84d8a9ee09a8c7b07ebf74f856e52df7f Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Wed, 27 May 2026 17:55:18 -0500
Subject: [PATCH 8/8] Remove README.md from model-deployment folder

---
 third_party/Dell/model-deployment/README.md | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 third_party/Dell/model-deployment/README.md

diff --git a/third_party/Dell/model-deployment/README.md b/third_party/Dell/model-deployment/README.md
deleted file mode 100644
index 43d98118..00000000
--- a/third_party/Dell/model-deployment/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# PLACEHOLDER
\ No newline at end of file