From 3c03b811773e1b96dc6e270c43a2319e09e90fcd Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 21 Apr 2026 14:44:12 -0500
Subject: [PATCH 1/7] feat: add TinyLlama-1.1B-Chat-v1.0 model card and
 deployment guide for Dell EI

Signed-off-by: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
---
 .../TinyLlama-1.1B-Chat-v1.0/deployment.md    | 77 +++++++++++++++++++
 .../TinyLlama-1.1B-Chat-v1.0/model-card.md    | 60 +++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
 create mode 100644 third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/model-card.md

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
new file mode 100644
index 00000000..6a02350d
--- /dev/null
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -0,0 +1,77 @@
+
+# Deployed with EI Version-1.3.1
+
+## Step 1: Set Environment Variables
+
+```bash
+# Export Hugging Face token
+export HUGGING_FACE_HUB_TOKEN="your_token_here"
+
+# Set your base URL and API token
+export BASE_HOST="your-cluster-url"
+
+#generate keyclock token
+export BASE_URL="https://your-cluster-url"
+export KEYCLOAK_CLIENT_ID=api
+export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
+export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+```
+
+## Step 2: Deploy Tinyllama-1.1b-chat-v1.0 Model
+
+```bash
+helm install Tinyllama-1-1b-cpu ./core/helm-charts/vllm \
+  --values ./core/helm-charts/vllm/xeon-values.yaml \
+  --set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
+  --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+  --set ingress.enabled=false \
+  --set ingress.host="${BASE_HOST}" \
+  --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
+  --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
+  --set apisix.enabled=true \
+  --set tensor_parallel_size="1" \
+  --set pipeline_parallel_size="1"
+```
+
+## Step 3: Test the Deployed Model
+
+```bash
+curl -k ${BASE_URL}/TinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+
+## To undeploy the model
+
+```bash
+helm uninstall Tinyllama-1-1b-cpu
+```
+
+## Parameters
+
+| Parameter                                                 | Description                                                                                           |
+| --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0"` | Defines the target model from **Hugging Face** to deploy.                                             |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`             | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
+| `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
+| `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
+| `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/model-card.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/model-card.md
new file mode 100644
index 00000000..c14ec2e2
--- /dev/null
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/model-card.md
@@ -0,0 +1,60 @@
+# TinyLlama-1.1B-Chat-v1.0
+
+This deployment uses TinyLlama-1.1B-Chat-v1.0, a compact large language model developed by the TinyLlama Project team. It is a chat-tuned variant of the TinyLlama 1.1B base model, optimized for instruction-following, conversational AI, and lightweight reasoning tasks. Despite its small size, TinyLlama delivers strong performance for edge AI, embedded systems, rapid prototyping, and cost-efficient inference scenarios.
+
+For full details, including model specifications, licensing, intended use, safety guidance, and example prompts, please visit the official Hugging Face page:
+
+https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0
+
+This deployment provides inference services only; the model weights are hosted on Hugging Face and distributed under the Apache 2.0 License.
+
+Ensure compliance with the Apache 2.0 License terms before using this model.
+
+### Model Attribution
+
+**Developer:** TinyLlama Project
+
+**Purpose:** Lightweight instruction-tuned conversational AI
+
+**Sizes/Variants:** 1.1B parameters
+
+**Modalities:** Text → Natural Language
+
+**Parameter Size:** 1.1 Billion
+
+**Max Context:** ~2K tokens
+
+**License:** Apache 2.0 (commercial-friendly)
+
+### Usage Notice
+
+**By using this model, you agree that:**
+
+- Inputs and outputs are processed by the TinyLlama-1.1B-Chat-v1.0 model under the Apache 2.0 license.
+- You are responsible for validating outputs before production use.
+- This model should not be used for generating malicious, deceptive, or unsafe content.
+- Outputs may contain inaccuracies and must be reviewed for correctness and compliance.
+
+### Intended Applications
+
+- Lightweight chatbots and virtual assistants
+- Edge AI and on-device inference
+- Rapid prototyping and AI experimentation
+- CPU-based conversational agents
+- Educational tools and demos
+- RAG-based document assistants for low-resource environments
+- Dev/test automation helpers
+
+### Limitations
+
+- Limited reasoning depth compared to large models (7B+)
+- Reduced long-context understanding
+- Not suitable for complex multi-step logic or heavy code generation
+- May hallucinate or oversimplify responses
+- Not designed for safety-critical or regulated decision systems
+
+### References
+
+TinyLlama Project — https://github.com/jzhang38/TinyLlama
+
+Hugging Face Model Page — https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

From 77baea32ad722da729fd536325b14d4bcac074d1 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Fri, 1 May 2026 12:27:32 -0500
Subject: [PATCH 2/7] update tinyllama deployment guide

---
 .../TinyLlama-1.1B-Chat-v1.0/deployment.md      | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
index 6a02350d..1af35931 100644
--- a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -1,26 +1,17 @@
-
-# Deployed with EI Version-1.3.1
-
 ## Step 1: Set Environment Variables
 
 ```bash
 # Export Hugging Face token
 export HUGGING_FACE_HUB_TOKEN="your_token_here"
 
-# Set your base URL and API token
-export BASE_HOST="your-cluster-url"
-
-#generate keyclock token
-export BASE_URL="https://your-cluster-url"
-export KEYCLOAK_CLIENT_ID=api
-export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
-export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+cd ~/Enterprise-Inference
+source core/scripts/generate-token.sh
 ```
 
 ## Step 2: Deploy Tinyllama-1.1b-chat-v1.0 Model
 
 ```bash
-helm install Tinyllama-1-1b-cpu ./core/helm-charts/vllm \
+helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \
   --values ./core/helm-charts/vllm/xeon-values.yaml \
   --set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
   --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
@@ -36,7 +27,7 @@ helm install Tinyllama-1-1b-cpu ./core/helm-charts/vllm \
 ## Step 3: Test the Deployed Model
 
 ```bash
-curl -k ${BASE_URL}/TinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
+curl -k ${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
   -X POST \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer $TOKEN" \

From 2190ba6d8a51940eafcd2a4375cd00f56e0b2f98 Mon Sep 17 00:00:00 2001
From: Harika <harika.devulapally@cloud2labs.com>
Date: Mon, 4 May 2026 22:18:56 -0500
Subject: [PATCH 3/7] Enable ingress and update deployment instructions

---
 .../TinyLlama-1.1B-Chat-v1.0/deployment.md               | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
index 1af35931..de74fc9a 100644
--- a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -15,8 +15,9 @@ helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \
   --values ./core/helm-charts/vllm/xeon-values.yaml \
   --set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
   --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
-  --set ingress.enabled=false \
-  --set ingress.host="${BASE_HOST}" \
+  --set ingress.enabled=true \
+  --set ingress.secretname=api.example.com \
+  --set ingress.host="${BASE_URL}" \
   --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
   --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
   --set apisix.enabled=true \
@@ -27,7 +28,7 @@ helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \
 ## Step 3: Test the Deployed Model
 
 ```bash
-curl -k ${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
+curl -k https://${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
   -X POST \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer $TOKEN" \
@@ -42,7 +43,7 @@ curl -k ${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
 ## To undeploy the model
 
 ```bash
-helm uninstall Tinyllama-1-1b-cpu
+helm uninstall tinyllama-1-1b-cpu
 ```
 
 ## Parameters

From 3e95da3bfcb930468b93adf69cf1b67946cdfc29 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 10:18:36 -0500
Subject: [PATCH 4/7] update tinyllama deployment.md

---
 .../TinyLlama-1.1B-Chat-v1.0/deployment.md    | 27 +++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
index de74fc9a..f07226e5 100644
--- a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -1,4 +1,4 @@
-## Step 1: Set Environment Variables
+## Step 1: pre-requisite to deploy EI with keycloak on Xeon
 
 ```bash
 # Export Hugging Face token
@@ -25,7 +25,30 @@ helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \
   --set pipeline_parallel_size="1"
 ```
 
-## Step 3: Test the Deployed Model
+## Step 3: Verify the Deployment
+
+```bash
+kubectl get pods
+kubectl get apisixroutes
+```
+
+Excepted Output:
+
+```
+NAME                                        READY   STATUS    RESTARTS
+keycloak-0                                  1/1     Running   0
+keycloak-postgresql-0                       1/1     Running   0
+tinyllama-1-1b-cpu-vllm-<hash>-<hash>       1/1     Running   0
+```
+
+> Note: The pod name suffix `<hash>-<hash>` is auto-generated by Kubernetes and will differ on each deployment. Ensure all pods show `1/1 Running`.
+
+```
+NAME                                    HOSTS
+tinyllama-1-1b-cpu-vllm-apisixroute     api.example.com
+```
+
+## Step 4: Test the Deployed Model
 
 ```bash
 curl -k https://${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \

From 3ff7abefcc19e2e19585b7e5adb7f1ae48e88fed Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 11:09:56 -0500
Subject: [PATCH 5/7] update tinyllama deployment.md

---
 .../TinyLlama-1.1B-Chat-v1.0/deployment.md    | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
index f07226e5..5eea9f44 100644
--- a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -1,13 +1,27 @@
-## Step 1: pre-requisite to deploy EI with keycloak on Xeon
+## Step 1: Prerequisites to Deploy TinyLlama Model on Xeon with Keycloak
+
+Ensure the Enterprise Inference stack with Keycloak is already deployed before proceeding.
+
+Edit `core/scripts/generate-token.sh` and set your values before sourcing it:
+
+| Variable                  | Description                                                              |
+| ------------------------- | ------------------------------------------------------------------------ |
+| `BASE_URL`                | Hostname of your cluster (e.g. `api.example.com`), without `https://`   |
+| `KEYCLOAK_ADMIN_USERNAME` | Keycloak admin username                                                  |
+| `KEYCLOAK_PASSWORD`       | Keycloak admin password                                                  |
+| `KEYCLOAK_CLIENT_ID`      | Keycloak client ID configured during EI deployment                       |
+
+Then run:
 
 ```bash
-# Export Hugging Face token
 export HUGGING_FACE_HUB_TOKEN="your_token_here"
 
 cd ~/Enterprise-Inference
 source core/scripts/generate-token.sh
 ```
 
+This exports: `BASE_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_CLIENT_SECRET`, and `TOKEN`.
+
 ## Step 2: Deploy Tinyllama-1.1b-chat-v1.0 Model
 
 ```bash
@@ -32,7 +46,7 @@ kubectl get pods
 kubectl get apisixroutes
 ```
 
-Excepted Output:
+Expected Output:
 
 ```
 NAME                                        READY   STATUS    RESTARTS
@@ -63,6 +77,8 @@ curl -k https://${BASE_URL}/tinyLlama-1.1B-Chat-v1.0-vllmcpu/v1/completions \
   }'
 ```
 
+If successful, the model will return a completion response.
+
 ## To undeploy the model
 
 ```bash
@@ -78,6 +94,11 @@ helm uninstall tinyllama-1-1b-cpu
 | `--set ingress.enabled=true`                              | Enables Kubernetes **Ingress** to expose the model service externally.                                |
 | `--set ingress.host="replace-ingress"`                    | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
 | `--set ingress.secretname="replace-secret"`               | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
+| `--set oidc.client_id="..."`                              | Keycloak OIDC client ID used for token-based authentication.                                          |
+| `--set oidc.client_secret="..."`                          | Keycloak OIDC client secret corresponding to the client ID.                                           |
+| `--set apisix.enabled=true`                               | Enables **APISIX** as the API gateway for routing and authentication.                                 |
+| `--set tensor_parallel_size="1"`                          | Number of tensor parallel workers. Typically the number of CPU sockets/NUMA nodes (Xeon) or GPUs per node. |
+| `--set pipeline_parallel_size="1"`                        | Number of pipeline parallel stages. Typically `1` for single-node deployments.                       |
 
 
 

From 8725bce6a8ddb017722d128c84d1ead2020f89b4 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 11:14:57 -0500
Subject: [PATCH 6/7] update tinyllama deployment.md

---
 .../model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
index 5eea9f44..4ed9503b 100644
--- a/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
+++ b/third_party/Dell/model-deployment/TinyLlama-1.1B-Chat-v1.0/deployment.md
@@ -30,7 +30,7 @@ helm install tinyllama-1-1b-cpu ./core/helm-charts/vllm \
   --set LLM_MODEL_ID="TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
   --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
   --set ingress.enabled=true \
-  --set ingress.secretname=api.example.com \
+  --set ingress.secretname="${BASE_URL}" \
   --set ingress.host="${BASE_URL}" \
   --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
   --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \

From 65821b9299cadcda378de653102080c69efa9897 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Wed, 27 May 2026 17:45:55 -0500
Subject: [PATCH 7/7] Remove README.md from model-deployment folder

---
 third_party/Dell/model-deployment/README.md | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 third_party/Dell/model-deployment/README.md

diff --git a/third_party/Dell/model-deployment/README.md b/third_party/Dell/model-deployment/README.md
deleted file mode 100644
index 43d98118..00000000
--- a/third_party/Dell/model-deployment/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# PLACEHOLDER
\ No newline at end of file