From 81eb61b97d03b118cf38008fa0de526a2146708e Mon Sep 17 00:00:00 2001
From: srih0527 <srihari.pinnamaraju@rackspace.com>
Date: Tue, 17 Mar 2026 14:33:04 -0500
Subject: [PATCH 1/3] fix: Dockerfile build issues and add docker-compose +
 Postman collection

- Fix apt package issues (remove software-properties-common, lsb-release)
- Patch const-correctness error in ggml-bitnet-mad.cpp for clang
- Use pre-built GGUF model to skip broken HF-to-GGUF conversion
- Add docker-compose.yml for easy container management
- Add Postman collection with 20 requests covering the API endpoints
---
 Dockerfile                             |  18 +-
 FastAPI-BitNet.postman_collection.json | 406 +++++++++++++++++++++++++
 docker-compose.yml                     |   7 +
 3 files changed, 425 insertions(+), 6 deletions(-)
 create mode 100644 FastAPI-BitNet.postman_collection.json
 create mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
index 1ab39dc..b2ba007 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,23 +10,29 @@ RUN git clone --recursive https://github.com/microsoft/BitNet.git /tmp/BitNet &&
     rm -rf /tmp/BitNet
 
 # Install dependencies
-RUN apt-get update && apt-get install -y \
+RUN apt-get update --fix-missing && apt-get install -y --no-install-recommends \
     wget \
-    lsb-release \
-    software-properties-common \
     gnupg \
     cmake \
     clang \
-    && bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" \
+    ca-certificates \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+# Add the const qualifier clang requires; the /const .../! address skips lines that are already fixed, so the step is idempotent
+RUN sed -i '/const int8_t \* y_col = y + col \* by;/!s/int8_t \* y_col = y + col \* by;/const int8_t * y_col = y + col * by;/' /code/src/ggml-bitnet-mad.cpp
+
 # Install Python dependencies
 RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt && \
     pip install "fastapi[standard]" "uvicorn[standard]" httpx fastapi-mcp psutil
 
-# (Optional) Run your setup_env.py if needed
-RUN python /code/setup_env.py -md /code/models/BitNet-b1.58-2B-4T -q i2_s
+# Download pre-built GGUF model (skips the broken HF-to-GGUF conversion)
+# Use the exact model name "BitNet-b1.58-2B-4T" so setup_env.py recognizes it
+RUN huggingface-cli download microsoft/bitnet-b1.58-2B-4T-gguf --local-dir /code/models/BitNet-b1.58-2B-4T
+
+# Run setup (compile + codegen, model already has the gguf so conversion is skipped)
+RUN python /code/setup_env.py -md /code/models/BitNet-b1.58-2B-4T -q i2_s 2>&1 \
+    || (cat /code/logs/*.log 2>/dev/null; exit 1)
 
 EXPOSE 8080
 
diff --git a/FastAPI-BitNet.postman_collection.json b/FastAPI-BitNet.postman_collection.json
new file mode 100644
index 0000000..ed15e49
--- /dev/null
+++ b/FastAPI-BitNet.postman_collection.json
@@ -0,0 +1,406 @@
+{
+	"info": {
+		"name": "FastAPI BitNet Orchestrator",
+		"description": "API collection for managing and interacting with BitNet llama server & CLI instances.",
+		"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
+	},
+	"variable": [
+		{
+			"key": "baseUrl",
+			"value": "http://localhost:8080",
+			"type": "string"
+		}
+	],
+	"item": [
+		{
+			"name": "Server Management",
+			"item": [
+				{
+					"name": "Estimate Max BitNet Servers",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/estimate?per_server_gb=1.5",
+							"host": ["{{baseUrl}}"],
+							"path": ["estimate"],
+							"query": [
+								{
+									"key": "per_server_gb",
+									"value": "1.5",
+									"description": "Estimated RAM usage per server in GB"
+								}
+							]
+						},
+						"description": "Estimate the maximum number of BitNet server instances based on RAM and CPU threads."
+					}
+				},
+				{
+					"name": "Initialize Server",
+					"request": {
+						"method": "POST",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/initialize-server?threads=2&ctx_size=2048&port=8081&system_prompt=You are a helpful assistant.&n_predict=4096&temperature=0.8",
+							"host": ["{{baseUrl}}"],
+							"path": ["initialize-server"],
+							"query": [
+								{ "key": "threads", "value": "2", "description": "Number of threads" },
+								{ "key": "ctx_size", "value": "2048", "description": "Context size in tokens" },
+								{ "key": "port", "value": "8081", "description": "Port for the server instance" },
+								{ "key": "system_prompt", "value": "You are a helpful assistant.", "description": "System prompt" },
+								{ "key": "n_predict", "value": "4096", "description": "Max tokens to predict" },
+								{ "key": "temperature", "value": "0.8", "description": "Sampling temperature" }
+							]
+						},
+						"description": "Initialize and start a single BitNet llama-server process."
+					}
+				},
+				{
+					"name": "Shutdown Server",
+					"request": {
+						"method": "POST",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/shutdown-server?port=8081",
+							"host": ["{{baseUrl}}"],
+							"path": ["shutdown-server"],
+							"query": [
+								{ "key": "port", "value": "8081", "description": "Port of the server to shut down" }
+							]
+						},
+						"description": "Shut down a specific BitNet server instance by port."
+					}
+				},
+				{
+					"name": "Get Server Status",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/server-status?port=8081",
+							"host": ["{{baseUrl}}"],
+							"path": ["server-status"],
+							"query": [
+								{ "key": "port", "value": "8081", "description": "Port of the server to check" }
+							]
+						},
+						"description": "Get the status of a specific BitNet server instance."
+					}
+				}
+			]
+		},
+		{
+			"name": "Batch Server Management",
+			"item": [
+				{
+					"name": "Initialize Batch Servers",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"servers\": [\n    {\n      \"threads\": 1,\n      \"ctx_size\": 2048,\n      \"port\": 8081,\n      \"system_prompt\": \"You are a helpful assistant.\",\n      \"n_predict\": 256,\n      \"temperature\": 0.8\n    },\n    {\n      \"threads\": 1,\n      \"ctx_size\": 2048,\n      \"port\": 8082,\n      \"system_prompt\": \"You are a coding assistant.\",\n      \"n_predict\": 512,\n      \"temperature\": 0.6\n    }\n  ]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/initialize-batch-servers",
+							"host": ["{{baseUrl}}"],
+							"path": ["initialize-batch-servers"]
+						},
+						"description": "Initialize multiple BitNet server instances in one request."
+					}
+				},
+				{
+					"name": "Shutdown Batch Servers",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"ports\": [8081, 8082]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/shutdown-batch-servers",
+							"host": ["{{baseUrl}}"],
+							"path": ["shutdown-batch-servers"]
+						},
+						"description": "Shut down multiple BitNet server instances."
+					}
+				},
+				{
+					"name": "Batch Server Status",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"ports\": [8081, 8082]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/batch-server-status",
+							"host": ["{{baseUrl}}"],
+							"path": ["batch-server-status"]
+						},
+						"description": "Get the status of multiple BitNet server instances."
+					}
+				}
+			]
+		},
+		{
+			"name": "Interaction",
+			"item": [
+				{
+					"name": "Chat with BitNet Server",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"message\": \"Hello, what can you do?\",\n  \"port\": 8081,\n  \"n_predict\": 256,\n  \"temperature\": 0.8\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/chat",
+							"host": ["{{baseUrl}}"],
+							"path": ["chat"]
+						},
+						"description": "Send a chat message to a running BitNet server instance."
+					}
+				},
+				{
+					"name": "Multichat with BitNet Servers",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"requests\": [\n    {\n      \"message\": \"What is 2+2?\",\n      \"port\": 8081,\n      \"n_predict\": 128,\n      \"temperature\": 0.5\n    },\n    {\n      \"message\": \"Tell me a joke.\",\n      \"port\": 8082,\n      \"n_predict\": 256,\n      \"temperature\": 1.0\n    }\n  ]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/multichat",
+							"host": ["{{baseUrl}}"],
+							"path": ["multichat"]
+						},
+						"description": "Send multiple chat messages to BitNet servers concurrently."
+					}
+				}
+			]
+		},
+		{
+			"name": "Llama CLI Management",
+			"item": [
+				{
+					"name": "Initialize Llama CLI Session",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"cli_alias\": \"bitnet_001\",\n  \"threads\": 2,\n  \"ctx_size\": 2048,\n  \"n_predict\": 256,\n  \"temperature\": 0.8,\n  \"system_prompt\": \"You are a helpful assistant.\"\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/initialize-llama-cli",
+							"host": ["{{baseUrl}}"],
+							"path": ["initialize-llama-cli"]
+						},
+						"description": "Start a persistent llama-cli process in conversational mode."
+					}
+				},
+				{
+					"name": "Initialize Batch Llama CLI Sessions",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"requests\": [\n    {\n      \"cli_alias\": \"session_a\",\n      \"threads\": 1,\n      \"ctx_size\": 2048,\n      \"n_predict\": 256,\n      \"temperature\": 0.8\n    },\n    {\n      \"cli_alias\": \"session_b\",\n      \"threads\": 1,\n      \"ctx_size\": 2048,\n      \"n_predict\": 512,\n      \"temperature\": 0.6\n    }\n  ]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/initialize-batch-llama-cli",
+							"host": ["{{baseUrl}}"],
+							"path": ["initialize-batch-llama-cli"]
+						},
+						"description": "Start multiple persistent llama-cli sessions in one request."
+					}
+				},
+				{
+					"name": "Shutdown Llama CLI Session",
+					"request": {
+						"method": "POST",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/shutdown-llama-cli/bitnet_001",
+							"host": ["{{baseUrl}}"],
+							"path": ["shutdown-llama-cli", "bitnet_001"]
+						},
+						"description": "Stop a specific persistent llama-cli session by alias."
+					}
+				},
+				{
+					"name": "Shutdown Batch Llama CLI Sessions",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"aliases\": [\"session_a\", \"session_b\"]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/shutdown-batch-llama-cli",
+							"host": ["{{baseUrl}}"],
+							"path": ["shutdown-batch-llama-cli"]
+						},
+						"description": "Stop multiple persistent llama-cli sessions."
+					}
+				},
+				{
+					"name": "Get Llama CLI Session Status",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/llama-cli-status/bitnet_001",
+							"host": ["{{baseUrl}}"],
+							"path": ["llama-cli-status", "bitnet_001"]
+						},
+						"description": "Get the status of a specific llama-cli session."
+					}
+				},
+				{
+					"name": "Batch Llama CLI Session Status",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"aliases\": [\"session_a\", \"session_b\"]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/batch-llama-cli-status",
+							"host": ["{{baseUrl}}"],
+							"path": ["batch-llama-cli-status"]
+						},
+						"description": "Get the status of multiple llama-cli sessions."
+					}
+				}
+			]
+		},
+		{
+			"name": "Llama CLI Interaction",
+			"item": [
+				{
+					"name": "Chat with Llama CLI Session",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"cli_alias\": \"bitnet_001\",\n  \"prompt\": \"What is the capital of France?\"\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/chat-llama-cli",
+							"host": ["{{baseUrl}}"],
+							"path": ["chat-llama-cli"]
+						},
+						"description": "Send a prompt to a running persistent llama-cli session."
+					}
+				},
+				{
+					"name": "Batch Chat with Llama CLI Sessions",
+					"request": {
+						"method": "POST",
+						"header": [
+							{ "key": "Content-Type", "value": "application/json" }
+						],
+						"body": {
+							"mode": "raw",
+							"raw": "{\n  \"requests\": [\n    {\n      \"cli_alias\": \"session_a\",\n      \"prompt\": \"What is 2+2?\"\n    },\n    {\n      \"cli_alias\": \"session_b\",\n      \"prompt\": \"Tell me a joke.\"\n    }\n  ]\n}"
+						},
+						"url": {
+							"raw": "{{baseUrl}}/batch-chat-llama-cli",
+							"host": ["{{baseUrl}}"],
+							"path": ["batch-chat-llama-cli"]
+						},
+						"description": "Send multiple chat prompts to llama-cli sessions concurrently."
+					}
+				}
+			]
+		},
+		{
+			"name": "Utilities",
+			"item": [
+				{
+					"name": "Run Benchmark",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/benchmark?model=BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf&n_token=128&threads=2&n_prompt=32",
+							"host": ["{{baseUrl}}"],
+							"path": ["benchmark"],
+							"query": [
+								{ "key": "model", "value": "BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf", "description": "Model enum value" },
+								{ "key": "n_token", "value": "128", "description": "Number of tokens to process" },
+								{ "key": "threads", "value": "2", "description": "Number of threads" },
+								{ "key": "n_prompt", "value": "32", "description": "Number of prompt tokens" }
+							]
+						},
+						"description": "Run a performance benchmark on a BitNet model."
+					}
+				},
+				{
+					"name": "Calculate Perplexity",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/perplexity?model=BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf&prompt=The quick brown fox jumps over the lazy dog and then runs across the field to find more food for the winter season ahead&threads=2&ctx_size=10&ppl_stride=0",
+							"host": ["{{baseUrl}}"],
+							"path": ["perplexity"],
+							"query": [
+								{ "key": "model", "value": "BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf", "description": "Model enum value" },
+								{ "key": "prompt", "value": "The quick brown fox jumps over the lazy dog and then runs across the field to find more food for the winter season ahead", "description": "Input text" },
+								{ "key": "threads", "value": "2" },
+								{ "key": "ctx_size", "value": "10" },
+								{ "key": "ppl_stride", "value": "0" }
+							]
+						},
+						"description": "Calculate perplexity of text using a BitNet model."
+					}
+				},
+				{
+					"name": "Get Model Sizes",
+					"request": {
+						"method": "GET",
+						"header": [],
+						"url": {
+							"raw": "{{baseUrl}}/model-sizes",
+							"host": ["{{baseUrl}}"],
+							"path": ["model-sizes"]
+						},
+						"description": "Get file sizes of all available .gguf model files."
+					}
+				}
+			]
+		}
+	]
+}
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..0477cc9
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,7 @@
+version: "3.8"
+
+services:
+  bitnet-api:
+    build: .
+    ports:
+      - "8080:8080"

From d407312ea861f7c834b0e6872e32f8fe4f2cbbb1 Mon Sep 17 00:00:00 2001
From: srih0527 <srihari.pinnamaraju@rackspace.com>
Date: Tue, 17 Mar 2026 15:01:40 -0500
Subject: [PATCH 2/3] fix: use /v1/chat/completions with LLaMA 3 chat template
 for clean responses

---
 app/lib/endpoints/chat_endpoints.py | 48 ++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/app/lib/endpoints/chat_endpoints.py b/app/lib/endpoints/chat_endpoints.py
index df94ff0..495cf4f 100644
--- a/app/lib/endpoints/chat_endpoints.py
+++ b/app/lib/endpoints/chat_endpoints.py
@@ -21,7 +21,7 @@ class ChatRequest(BaseModel):
     port: int = 8081
     threads: int = Field(default_factory=lambda: os.cpu_count() or 1, gt=0)
     ctx_size: int = Field(default=2048, gt=0)
-    n_predict: int = Field(default=256, gt=0)
+    n_predict: int = Field(default=128, gt=0)
     temperature: float = Field(default=0.8, gt=0.0, le=2.0)
 
 
@@ -35,6 +35,26 @@ class MultiChatRequest(BaseModel):
 
 # --- Endpoint logic functions ---
 
+def _clean_response(text: str) -> str:
+    """Return *text* cut at the first meta-text marker, with whitespace stripped.
+
+    Markers are "Question:", "Input:" or "Output:" after a line break, and the
+    "(no answer)"-style parentheticals, all matched case-insensitively.  A missing
+    body (``None`` or ``""``) yields ``""`` instead of raising ``TypeError``.
+    """
+    import re
+
+    if not text:  # e.g. a completion whose "content" field is JSON null
+        return ""
+    # A single alternation locates the earliest marker of any kind in one scan.
+    marker = re.search(
+        r'\n\s*(?:Question|Input|Output)\s*:'
+        r'|\((?:no answer|end of answer|no answer required)\)',
+        text, re.IGNORECASE,
+    )
+    return (text[:marker.start()] if marker else text).strip()
+
+
 async def handle_chat_with_bitnet_server(chat: ChatRequest):
     host = "127.0.0.1"
     key = (host, chat.port)
@@ -43,11 +63,20 @@ async def handle_chat_with_bitnet_server(chat: ChatRequest):
     if not (proc_entry and proc_entry["process"].returncode is None and cfg):
         logger.warning(f"Chat request to non-existent or stopped server on port {chat.port}.")
         raise HTTPException(status_code=404, detail=f"Server on port {chat.port} not running or not configured.")
-    server_url = f"http://{host}:{chat.port}/completion"
+
+    # Use the OpenAI-compatible chat completions endpoint so llama-server
+    # applies the correct LLaMA 3 chat template automatically.
+    server_url = f"http://{host}:{chat.port}/v1/chat/completions"
+
+    system_prompt = cfg.get("system_prompt", "You are a helpful assistant.")
     payload = {
-        "prompt": chat.message,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": chat.message},
+        ],
         "n_predict": chat.n_predict,
         "temperature": chat.temperature,
+        "stop": ["\nQuestion:", "\nInput:", "\nOutput:", "\n\nQuestion:", "\n\nInput:"],
     }
 
     try:
@@ -55,8 +84,17 @@ async def handle_chat_with_bitnet_server(chat: ChatRequest):
             response = await client.post(server_url, json=payload, timeout=60.0)
             response.raise_for_status()
             response_data = response.json()
-            # Ensure the key "content" exists before accessing it
-            return {"response": response_data.get("content", ""), "port": chat.port}
+
+            # OpenAI-compatible format: choices[0].message.content
+            choices = response_data.get("choices", [])
+            if choices:
+                content = choices[0].get("message", {}).get("content", "")
+            else:
+                # Fallback for raw /completion style response
+                content = response_data.get("content", "")
+
+            content = _clean_response(content)
+            return {"response": content, "port": chat.port}
     except httpx.RequestError as e:
         logger.error(f"HTTP request error to server {host}:{chat.port}: {e}")
         raise HTTPException(status_code=503, detail=f"Error communicating with BitNet server on port {chat.port}: {e}")

From 33e3d3163919190590457da44d996c3d375cd177 Mon Sep 17 00:00:00 2001
From: srih0527 <srihari.pinnamaraju@rackspace.com>
Date: Tue, 17 Mar 2026 15:17:31 -0500
Subject: [PATCH 3/3] feat: add volume mounts and uvicorn --reload for
 hot-reloading code changes

---
 docker-compose.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index 0477cc9..c56fce3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,3 +5,7 @@ services:
     build: .
     ports:
       - "8080:8080"
+    volumes:
+      - ./app/main.py:/code/main.py
+      - ./app/lib:/code/lib
+    command: ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--reload", "--reload-dir", "/code/lib"]