From 2b753d61c7a0a7a29655f90a3113dde47af17c36 Mon Sep 17 00:00:00 2001
From: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
Date: Tue, 21 Apr 2026 14:48:45 -0500
Subject: [PATCH 1/3] feat: add DeepSeek-R1-Distill-Llama-8B model card and
 deployment guide for Dell EI

Signed-off-by: arpannookala-12 <ganesh.arpan.nookala@cloud2labs.com>
---
 .../deployment.md                             | 77 +++++++++++++++++++
 .../model-card.md                             | 67 ++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
 create mode 100644 third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/model-card.md

diff --git a/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
new file mode 100644
index 00000000..d806deb3
--- /dev/null
+++ b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
@@ -0,0 +1,77 @@
+
+# Deployed with EI Version-1.3.1
+
+## Step 1: Set Environment Variables
+
+```bash
+# Export Hugging Face token
+export HUGGING_FACE_HUB_TOKEN="your_token_here"
+
+# Set your base URL and API token
+export BASE_HOST="your-cluster-url"
+
+#generate keyclock token
+export BASE_URL="https://your-cluster-url"
+export KEYCLOAK_CLIENT_ID=api
+export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
+export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+```
+
+## Step 2: Deploy Deepseek R1 Distill Llama Model
+
+```bash
+helm install deepseek-r1-distill-cpu ./core/helm-charts/vllm \
+  --values ./core/helm-charts/vllm/xeon-values.yaml \
+  --set LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
+  --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
+  --set ingress.enabled=false \
+  --set ingress.host="${BASE_HOST}" \
+  --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
+  --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
+  --set apisix.enabled=true \
+  --set tensor_parallel_size="1" \
+  --set pipeline_parallel_size="1"
+```
+
+## Step 3: Test the Deployed Model
+
+```bash
+ curl -k ${BASE_URL}/DeepSeek-R1-Distill-Llama-8B-vllmcpu/v1/completions \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $TOKEN" \
+  -d '{
+    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "prompt": "What is Deep Learning?",
+    "max_tokens": 25,
+    "temperature": 0
+  }'
+```
+
+## To undeploy the model
+
+```bash
+helm uninstall deepseek-r1-distill-cpu
+```
+
+## Parameters
+
+| Parameter                                                      | Description                                                                                           |
+| ---------------------------------------------------------------| ----------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"`| Defines the target model from **Hugging Face** to deploy.                                             |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`                  | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
+| `--set ingress.enabled=true`                                   | Enables Kubernetes **Ingress** to expose the model service externally.                                |
+| `--set ingress.host="replace-ingress"`                         | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
+| `--set ingress.secretname="replace-secret"`                    | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/model-card.md b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/model-card.md
new file mode 100644
index 00000000..8a2659a3
--- /dev/null
+++ b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/model-card.md
@@ -0,0 +1,67 @@
+# DeepSeek-R1-Distill-Llama-8B
+
+This model uses DeepSeek-R1-Distill-Llama-8B, an 8-billion-parameter reasoning model distilled from the larger DeepSeek-R1 family and built upon Meta’s Llama architecture. It is optimized for lightweight deployment, faster inference, and efficient reasoning performance while preserving strong capabilities in logic, dialogue, and code generation.
+DeepSeek’s R1 reinforcement learning process and distillation techniques enable this smaller variant to maintain high reasoning quality with substantially reduced computational requirements.
+
+For complete technical details, licensing, evaluation metrics, and usage guidelines, please refer to the official Hugging Face model page:
+
+https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+This model provides inference-only access and is distributed under the DeepSeek license.
+
+Ensure full compliance with the DeepSeek and Meta licensing terms before integrating this model into any application or service.
+
+### Model Attribution
+
+**Developer:**	DeepSeek AI
+
+**Purpose:** Lightweight reasoning, dialogue, and code generation
+
+**Sizes/Variants:**	8B distilled reasoning model
+
+**Modalities:**	Text → Text (Reasoning, Coding, and Dialogue)
+
+**Parameter Size:** 8 billion
+
+**Max Context:**	~64K tokens (backend dependent)
+
+**License:** DeepSeek License (use-restricted; see Hugging Face page)
+
+**Minimum required CPU Cores:** 157
+
+### Usage Notice
+
+**By using this model, you agree that:**
+
+- All data is processed through the DeepSeek-R1-Distill-Llama-8B model hosted under the DeepSeek license.
+- You must follow the DeepSeek and Meta licensing requirements, including possible non-commercial or restricted-use clauses.
+- Generated content (text, reasoning traces, or code) must be validated for correctness and safety before production use.
+- The model must not be used to produce harmful content, misinformation, or automated decisions in critical or regulated domains.
+
+### Intended Applications
+
+- Lightweight and cost-efficient reasoning and problem-solving
+- Assistant-style multi-turn conversations
+- Code generation, completion, and debugging (Python, Go, JavaScript, etc.)
+- Educational tools, research prototypes, and RAG-based assistant systems
+- Baselines for fine-tuning and further distillation research
+- On-device or edge inference scenarios with GPU/memory constraints
+
+### Limitations
+
+- May produce inaccurate or incomplete reasoning steps
+- Smaller size may reduce performance on highly complex logic or long-context tasks
+- Not suited for safety-critical or regulated environments
+- License may restrict commercial use
+- Inference performance depends on optimized backends for smaller Llama-based models
+
+### References
+
+DeepSeek Official Site: https://www.deepseek.com
+
+Hugging Face Model Card: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+
+Llama Architecture Reference: https://huggingface.co/meta-llama
+
+
+

From 2e0464f17f3faab92133ba67cc56fdce1361a058 Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Tue, 5 May 2026 12:25:59 -0500
Subject: [PATCH 2/3] update deepseek r1 distill llama 8b

---
 .../deployment.md                             | 94 ++++++++++++-------
 1 file changed, 59 insertions(+), 35 deletions(-)

diff --git a/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
index d806deb3..e7421ff8 100644
--- a/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
+++ b/third_party/Dell/model-deployment/DeepSeek-R1-Distill-Llama-8B/deployment.md
@@ -1,31 +1,37 @@
+## Step 1: Prerequisites to Deploy DeepSeek-R1-Distill-Llama-8B Model on Xeon with Keycloak
 
-# Deployed with EI Version-1.3.1
+Ensure the Enterprise Inference stack with Keycloak is already deployed before proceeding.
 
-## Step 1: Set Environment Variables
+Edit `core/scripts/generate-token.sh` and set your values before sourcing it:
+
+| Variable                  | Description                                                              |
+| ------------------------- | ------------------------------------------------------------------------ |
+| `BASE_URL`                | Hostname of your cluster (e.g. `api.example.com`), without `https://`   |
+| `KEYCLOAK_ADMIN_USERNAME` | Keycloak admin username                                                  |
+| `KEYCLOAK_PASSWORD`       | Keycloak admin password                                                  |
+| `KEYCLOAK_CLIENT_ID`      | Keycloak client ID configured during EI deployment                       |
+
+Then run:
 
 ```bash
-# Export Hugging Face token
 export HUGGING_FACE_HUB_TOKEN="your_token_here"
 
-# Set your base URL and API token
-export BASE_HOST="your-cluster-url"
-
-#generate keyclock token
-export BASE_URL="https://your-cluster-url"
-export KEYCLOAK_CLIENT_ID=api
-export KEYCLOAK_CLIENT_SECRET="your keyclock client secret"
-export TOKEN=$(curl -k -X POST $BASE_URL/token  -H 'Content-Type: application/x-www-form-urlencoded' -d "grant_type=client_credentials&client_id=${KEYCLOAK_CLIENT_ID}&client_secret=${KEYCLOAK_CLIENT_SECRET}" | jq -r .access_token)
+cd ~/Enterprise-Inference
+source core/scripts/generate-token.sh
 ```
 
-## Step 2: Deploy Deepseek R1 Distill Llama Model
+This exports: `BASE_URL`, `KEYCLOAK_CLIENT_ID`, `KEYCLOAK_CLIENT_SECRET`, and `TOKEN`.
+
+## Step 2: Deploy DeepSeek-R1-Distill-Llama-8B Model
 
 ```bash
 helm install deepseek-r1-distill-cpu ./core/helm-charts/vllm \
   --values ./core/helm-charts/vllm/xeon-values.yaml \
   --set LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
   --set global.HUGGINGFACEHUB_API_TOKEN="$HUGGING_FACE_HUB_TOKEN" \
-  --set ingress.enabled=false \
-  --set ingress.host="${BASE_HOST}" \
+  --set ingress.enabled=true \
+  --set ingress.secretname="${BASE_URL}" \
+  --set ingress.host="${BASE_URL}" \
   --set oidc.client_id="$KEYCLOAK_CLIENT_ID" \
   --set oidc.client_secret="$KEYCLOAK_CLIENT_SECRET" \
   --set apisix.enabled=true \
@@ -33,10 +39,33 @@ helm install deepseek-r1-distill-cpu ./core/helm-charts/vllm \
   --set pipeline_parallel_size="1"
 ```
 
-## Step 3: Test the Deployed Model
+## Step 3: Verify the Deployment
 
 ```bash
- curl -k ${BASE_URL}/DeepSeek-R1-Distill-Llama-8B-vllmcpu/v1/completions \
+kubectl get pods
+kubectl get apisixroutes
+```
+
+Expected output (`kubectl get pods` first, then `kubectl get apisixroutes`):
+
+```
+NAME                                          READY   STATUS    RESTARTS
+keycloak-0                                    1/1     Running   0
+keycloak-postgresql-0                         1/1     Running   0
+deepseek-r1-distill-cpu-<hash>-<hash>         1/1     Running   0
+```
+
+> Note: The pod name suffix `<hash>-<hash>` is auto-generated by Kubernetes and will differ on each deployment. Ensure all pods show `1/1 Running`.
+
+```
+NAME                                    HOSTS
+deepseek-r1-distill-cpu-apisixroute     api.example.com
+```
+
+## Step 4: Test the Deployed Model
+
+```bash
+curl -k https://${BASE_URL}/DeepSeek-R1-Distill-Llama-8B-vllmcpu/v1/completions \
   -X POST \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer $TOKEN" \
@@ -48,6 +77,8 @@ helm install deepseek-r1-distill-cpu ./core/helm-charts/vllm \
   }'
 ```
 
+If successful, the model will return a completion response.
+
 ## To undeploy the model
 
 ```bash
@@ -56,22 +87,15 @@ helm uninstall deepseek-r1-distill-cpu
 
 ## Parameters
 
-| Parameter                                                      | Description                                                                                           |
-| ---------------------------------------------------------------| ----------------------------------------------------------------------------------------------------- |
-| `--set LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"`| Defines the target model from **Hugging Face** to deploy.                                             |
-| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`                  | Authenticates access to gated or private Hugging Face models. Replace with your own secure token.     |
-| `--set ingress.enabled=true`                                   | Enables Kubernetes **Ingress** to expose the model service externally.                                |
-| `--set ingress.host="replace-ingress"`                         | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).              |
-| `--set ingress.secretname="replace-secret"`                    | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                            |
-
-
-
-
-
-
-
-
-
-
-
-
+| Parameter                                                       | Description                                                                                       |
+| --------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| `--set LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"`| Defines the target model from **Hugging Face** to deploy.                                         |
+| `--set global.HUGGINGFACEHUB_API_TOKEN="..."`                   | Authenticates access to gated or private Hugging Face models. Replace with your own secure token. |
+| `--set ingress.enabled=true`                                    | Enables Kubernetes **Ingress** to expose the model service externally.                            |
+| `--set ingress.host="${BASE_URL}"`                              | Public hostname or FQDN for the inference endpoint (maps to your Ingress controller IP).          |
+| `--set ingress.secretname="${BASE_URL}"`                        | Kubernetes **TLS Secret** used for HTTPS termination at the ingress layer.                        |
+| `--set oidc.client_id="..."`                                    | Keycloak OIDC client ID used for token-based authentication.                                      |
+| `--set oidc.client_secret="..."`                                | Keycloak OIDC client secret corresponding to the client ID.                                       |
+| `--set apisix.enabled=true`                                     | Enables **APISIX** as the API gateway for routing and authentication.                             |
+| `--set tensor_parallel_size="1"`                                | Number of tensor parallel workers. Set to the number of available CPUs/GPUs per node.            |
+| `--set pipeline_parallel_size="1"`                              | Number of pipeline parallel stages. Typically `1` for single-node deployments.                   |

From 12fdfaa6fe3e34e0310ede39916f6060daa7f7ea Mon Sep 17 00:00:00 2001
From: Harika <codewith3@gmail.com>
Date: Wed, 27 May 2026 17:53:35 -0500
Subject: [PATCH 3/3] Remove README.md from model-deployment folder

---
 third_party/Dell/model-deployment/README.md | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 third_party/Dell/model-deployment/README.md

diff --git a/third_party/Dell/model-deployment/README.md b/third_party/Dell/model-deployment/README.md
deleted file mode 100644
index 43d98118..00000000
--- a/third_party/Dell/model-deployment/README.md
+++ /dev/null
@@ -1 +0,0 @@
-# PLACEHOLDER
\ No newline at end of file