From 81eb61b97d03b118cf38008fa0de526a2146708e Mon Sep 17 00:00:00 2001 From: srih0527 Date: Tue, 17 Mar 2026 14:33:04 -0500 Subject: [PATCH 1/3] fix: Dockerfile build issues and add docker-compose + Postman collection - Fix apt package issues (remove software-properties-common, lsb-release) - Patch const-correctness error in ggml-bitnet-mad.cpp for clang - Use pre-built GGUF model to skip broken HF-to-GGUF conversion - Add docker-compose.yml for easy container management - Add Postman collection covering all 19 API endpoints --- Dockerfile | 18 +- FastAPI-BitNet.postman_collection.json | 406 +++++++++++++++++++++++++ docker-compose.yml | 7 + 3 files changed, 425 insertions(+), 6 deletions(-) create mode 100644 FastAPI-BitNet.postman_collection.json create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile index 1ab39dc..b2ba007 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,23 +10,29 @@ RUN git clone --recursive https://github.com/microsoft/BitNet.git /tmp/BitNet && rm -rf /tmp/BitNet # Install dependencies -RUN apt-get update && apt-get install -y \ +RUN apt-get update --fix-missing && apt-get install -y --no-install-recommends \ wget \ - lsb-release \ - software-properties-common \ gnupg \ cmake \ clang \ - && bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" \ + ca-certificates \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +# Patch const-correctness error in BitNet source (clang is stricter than gcc) +RUN sed -i 's/int8_t \* y_col = y + col \* by;/const int8_t * y_col = y + col * by;/' /code/src/ggml-bitnet-mad.cpp + # Install Python dependencies RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt && \ pip install "fastapi[standard]" "uvicorn[standard]" httpx fastapi-mcp psutil -# (Optional) Run your setup_env.py if needed -RUN python /code/setup_env.py -md /code/models/BitNet-b1.58-2B-4T -q i2_s +# Download pre-built GGUF model (skips the broken HF-to-GGUF conversion) +# Use the exact model name "BitNet-b1.58-2B-4T" so setup_env.py recognizes it +RUN huggingface-cli download microsoft/bitnet-b1.58-2B-4T-gguf --local-dir /code/models/BitNet-b1.58-2B-4T + +# Run setup (compile + codegen, model already has the gguf so conversion is skipped) +RUN python /code/setup_env.py -md /code/models/BitNet-b1.58-2B-4T -q i2_s 2>&1 \ + || (cat /code/logs/*.log 2>/dev/null; exit 1) EXPOSE 8080 diff --git a/FastAPI-BitNet.postman_collection.json b/FastAPI-BitNet.postman_collection.json new file mode 100644 index 0000000..ed15e49 --- /dev/null +++ b/FastAPI-BitNet.postman_collection.json @@ -0,0 +1,406 @@ +{ + "info": { + "name": "FastAPI BitNet Orchestrator", + "description": "API collection for managing and interacting with BitNet llama server & CLI instances.", + "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" + }, + "variable": [ + { + "key": "baseUrl", + "value": "http://localhost:8080", + "type": "string" + } + ], + "item": [ + { + "name": "Server Management", + "item": [ + { + "name": "Estimate Max BitNet Servers", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/estimate?per_server_gb=1.5", + "host": ["{{baseUrl}}"], + "path": ["estimate"], + "query": [ + { + "key": "per_server_gb", + "value": "1.5", + "description": "Estimated RAM usage per server in GB" + } + ] + }, + "description": "Estimate the maximum number of BitNet server instances based on RAM and CPU threads." + } + }, + { + "name": "Initialize Server", + "request": { + "method": "POST", + "header": [], + "url": { + "raw": "{{baseUrl}}/initialize-server?threads=2&ctx_size=2048&port=8081&system_prompt=You are a helpful assistant.&n_predict=4096&temperature=0.8", + "host": ["{{baseUrl}}"], + "path": ["initialize-server"], + "query": [ + { "key": "threads", "value": "2", "description": "Number of threads" }, + { "key": "ctx_size", "value": "2048", "description": "Context size in tokens" }, + { "key": "port", "value": "8081", "description": "Port for the server instance" }, + { "key": "system_prompt", "value": "You are a helpful assistant.", "description": "System prompt" }, + { "key": "n_predict", "value": "4096", "description": "Max tokens to predict" }, + { "key": "temperature", "value": "0.8", "description": "Sampling temperature" } + ] + }, + "description": "Initialize and start a single BitNet llama-server process." + } + }, + { + "name": "Shutdown Server", + "request": { + "method": "POST", + "header": [], + "url": { + "raw": "{{baseUrl}}/shutdown-server?port=8081", + "host": ["{{baseUrl}}"], + "path": ["shutdown-server"], + "query": [ + { "key": "port", "value": "8081", "description": "Port of the server to shut down" } + ] + }, + "description": "Shut down a specific BitNet server instance by port." + } + }, + { + "name": "Get Server Status", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/server-status?port=8081", + "host": ["{{baseUrl}}"], + "path": ["server-status"], + "query": [ + { "key": "port", "value": "8081", "description": "Port of the server to check" } + ] + }, + "description": "Get the status of a specific BitNet server instance." + } + } + ] + }, + { + "name": "Batch Server Management", + "item": [ + { + "name": "Initialize Batch Servers", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"servers\": [\n {\n \"threads\": 1,\n \"ctx_size\": 2048,\n \"port\": 8081,\n \"system_prompt\": \"You are a helpful assistant.\",\n \"n_predict\": 256,\n \"temperature\": 0.8\n },\n {\n \"threads\": 1,\n \"ctx_size\": 2048,\n \"port\": 8082,\n \"system_prompt\": \"You are a coding assistant.\",\n \"n_predict\": 512,\n \"temperature\": 0.6\n }\n ]\n}" + }, + "url": { + "raw": "{{baseUrl}}/initialize-batch-servers", + "host": ["{{baseUrl}}"], + "path": ["initialize-batch-servers"] + }, + "description": "Initialize multiple BitNet server instances in one request." + } + }, + { + "name": "Shutdown Batch Servers", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"ports\": [8081, 8082]\n}" + }, + "url": { + "raw": "{{baseUrl}}/shutdown-batch-servers", + "host": ["{{baseUrl}}"], + "path": ["shutdown-batch-servers"] + }, + "description": "Shut down multiple BitNet server instances." + } + }, + { + "name": "Batch Server Status", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"ports\": [8081, 8082]\n}" + }, + "url": { + "raw": "{{baseUrl}}/batch-server-status", + "host": ["{{baseUrl}}"], + "path": ["batch-server-status"] + }, + "description": "Get the status of multiple BitNet server instances." + } + } + ] + }, + { + "name": "Interaction", + "item": [ + { + "name": "Chat with BitNet Server", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"message\": \"Hello, what can you do?\",\n \"port\": 8081,\n \"n_predict\": 256,\n \"temperature\": 0.8\n}" + }, + "url": { + "raw": "{{baseUrl}}/chat", + "host": ["{{baseUrl}}"], + "path": ["chat"] + }, + "description": "Send a chat message to a running BitNet server instance." + } + }, + { + "name": "Multichat with BitNet Servers", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"requests\": [\n {\n \"message\": \"What is 2+2?\",\n \"port\": 8081,\n \"n_predict\": 128,\n \"temperature\": 0.5\n },\n {\n \"message\": \"Tell me a joke.\",\n \"port\": 8082,\n \"n_predict\": 256,\n \"temperature\": 1.0\n }\n ]\n}" + }, + "url": { + "raw": "{{baseUrl}}/multichat", + "host": ["{{baseUrl}}"], + "path": ["multichat"] + }, + "description": "Send multiple chat messages to BitNet servers concurrently." + } + } + ] + }, + { + "name": "Llama CLI Management", + "item": [ + { + "name": "Initialize Llama CLI Session", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"cli_alias\": \"bitnet_001\",\n \"threads\": 2,\n \"ctx_size\": 2048,\n \"n_predict\": 256,\n \"temperature\": 0.8,\n \"system_prompt\": \"You are a helpful assistant.\"\n}" + }, + "url": { + "raw": "{{baseUrl}}/initialize-llama-cli", + "host": ["{{baseUrl}}"], + "path": ["initialize-llama-cli"] + }, + "description": "Start a persistent llama-cli process in conversational mode." + } + }, + { + "name": "Initialize Batch Llama CLI Sessions", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"requests\": [\n {\n \"cli_alias\": \"session_a\",\n \"threads\": 1,\n \"ctx_size\": 2048,\n \"n_predict\": 256,\n \"temperature\": 0.8\n },\n {\n \"cli_alias\": \"session_b\",\n \"threads\": 1,\n \"ctx_size\": 2048,\n \"n_predict\": 512,\n \"temperature\": 0.6\n }\n ]\n}" + }, + "url": { + "raw": "{{baseUrl}}/initialize-batch-llama-cli", + "host": ["{{baseUrl}}"], + "path": ["initialize-batch-llama-cli"] + }, + "description": "Start multiple persistent llama-cli sessions in one request." + } + }, + { + "name": "Shutdown Llama CLI Session", + "request": { + "method": "POST", + "header": [], + "url": { + "raw": "{{baseUrl}}/shutdown-llama-cli/bitnet_001", + "host": ["{{baseUrl}}"], + "path": ["shutdown-llama-cli", "bitnet_001"] + }, + "description": "Stop a specific persistent llama-cli session by alias." + } + }, + { + "name": "Shutdown Batch Llama CLI Sessions", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"aliases\": [\"session_a\", \"session_b\"]\n}" + }, + "url": { + "raw": "{{baseUrl}}/shutdown-batch-llama-cli", + "host": ["{{baseUrl}}"], + "path": ["shutdown-batch-llama-cli"] + }, + "description": "Stop multiple persistent llama-cli sessions." + } + }, + { + "name": "Get Llama CLI Session Status", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/llama-cli-status/bitnet_001", + "host": ["{{baseUrl}}"], + "path": ["llama-cli-status", "bitnet_001"] + }, + "description": "Get the status of a specific llama-cli session." + } + }, + { + "name": "Batch Llama CLI Session Status", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"aliases\": [\"session_a\", \"session_b\"]\n}" + }, + "url": { + "raw": "{{baseUrl}}/batch-llama-cli-status", + "host": ["{{baseUrl}}"], + "path": ["batch-llama-cli-status"] + }, + "description": "Get the status of multiple llama-cli sessions." + } + } + ] + }, + { + "name": "Llama CLI Interaction", + "item": [ + { + "name": "Chat with Llama CLI Session", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"cli_alias\": \"bitnet_001\",\n \"prompt\": \"What is the capital of France?\"\n}" + }, + "url": { + "raw": "{{baseUrl}}/chat-llama-cli", + "host": ["{{baseUrl}}"], + "path": ["chat-llama-cli"] + }, + "description": "Send a prompt to a running persistent llama-cli session." + } + }, + { + "name": "Batch Chat with Llama CLI Sessions", + "request": { + "method": "POST", + "header": [ + { "key": "Content-Type", "value": "application/json" } + ], + "body": { + "mode": "raw", + "raw": "{\n \"requests\": [\n {\n \"cli_alias\": \"session_a\",\n \"prompt\": \"What is 2+2?\"\n },\n {\n \"cli_alias\": \"session_b\",\n \"prompt\": \"Tell me a joke.\"\n }\n ]\n}" + }, + "url": { + "raw": "{{baseUrl}}/batch-chat-llama-cli", + "host": ["{{baseUrl}}"], + "path": ["batch-chat-llama-cli"] + }, + "description": "Send multiple chat prompts to llama-cli sessions concurrently." + } + } + ] + }, + { + "name": "Utilities", + "item": [ + { + "name": "Run Benchmark", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/benchmark?model=BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf&n_token=128&threads=2&n_prompt=32", + "host": ["{{baseUrl}}"], + "path": ["benchmark"], + "query": [ + { "key": "model", "value": "BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf", "description": "Model enum value" }, + { "key": "n_token", "value": "128", "description": "Number of tokens to process" }, + { "key": "threads", "value": "2", "description": "Number of threads" }, + { "key": "n_prompt", "value": "32", "description": "Number of prompt tokens" } + ] + }, + "description": "Run a performance benchmark on a BitNet model." + } + }, + { + "name": "Calculate Perplexity", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/perplexity?model=BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf&prompt=The quick brown fox jumps over the lazy dog and then runs across the field to find more food for the winter season ahead&threads=2&ctx_size=10&ppl_stride=0", + "host": ["{{baseUrl}}"], + "path": ["perplexity"], + "query": [ + { "key": "model", "value": "BitNet_b1_58_2B_4T_ggml_model_i2_s_gguf", "description": "Model enum value" }, + { "key": "prompt", "value": "The quick brown fox jumps over the lazy dog and then runs across the field to find more food for the winter season ahead", "description": "Input text" }, + { "key": "threads", "value": "2" }, + { "key": "ctx_size", "value": "10" }, + { "key": "ppl_stride", "value": "0" } + ] + }, + "description": "Calculate perplexity of text using a BitNet model." + } + }, + { + "name": "Get Model Sizes", + "request": { + "method": "GET", + "header": [], + "url": { + "raw": "{{baseUrl}}/model-sizes", + "host": ["{{baseUrl}}"], + "path": ["model-sizes"] + }, + "description": "Get file sizes of all available .gguf model files." + } + } + ] + } + ] +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..0477cc9 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,7 @@ +version: "3.8" + +services: + bitnet-api: + build: . + ports: + - "8080:8080" From d407312ea861f7c834b0e6872e32f8fe4f2cbbb1 Mon Sep 17 00:00:00 2001 From: srih0527 Date: Tue, 17 Mar 2026 15:01:40 -0500 Subject: [PATCH 2/3] fix: use /v1/chat/completions with LLaMA 3 chat template for clean responses --- app/lib/endpoints/chat_endpoints.py | 48 ++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/app/lib/endpoints/chat_endpoints.py b/app/lib/endpoints/chat_endpoints.py index df94ff0..495cf4f 100644 --- a/app/lib/endpoints/chat_endpoints.py +++ b/app/lib/endpoints/chat_endpoints.py @@ -21,7 +21,7 @@ class ChatRequest(BaseModel): port: int = 8081 threads: int = Field(default_factory=lambda: os.cpu_count() or 1, gt=0) ctx_size: int = Field(default=2048, gt=0) - n_predict: int = Field(default=256, gt=0) + n_predict: int = Field(default=128, gt=0) temperature: float = Field(default=0.8, gt=0.0, le=2.0) @@ -35,6 +35,26 @@ class MultiChatRequest(BaseModel): # --- Endpoint logic functions --- +def _clean_response(text: str) -> str: + """Strip repetitive or meta-text patterns from raw model output.""" + import re + # Take only the first meaningful answer block + # Stop at patterns like "Question:", "Input:", "Output:", "(no answer)", "(end of answer)" + cut_patterns = [ + r'\n\s*Question\s*:', + r'\n\s*Input\s*:', + r'\n\s*Output\s*:', + r'\(no answer\)', + r'\(end of answer\)', + r'\(No answer required\)', + ] + for pat in cut_patterns: + m = re.search(pat, text, re.IGNORECASE) + if m: + text = text[:m.start()] + return text.strip() + + async def handle_chat_with_bitnet_server(chat: ChatRequest): host = "127.0.0.1" key = (host, chat.port) @@ -43,11 +63,20 @@ async def handle_chat_with_bitnet_server(chat: ChatRequest): if not (proc_entry and proc_entry["process"].returncode is None and cfg): logger.warning(f"Chat request to non-existent or stopped server on port {chat.port}.") raise HTTPException(status_code=404, detail=f"Server on port {chat.port} not running or not configured.") - server_url = f"http://{host}:{chat.port}/completion" + + # Use the OpenAI-compatible chat completions endpoint so llama-server + # applies the correct LLaMA 3 chat template automatically. + server_url = f"http://{host}:{chat.port}/v1/chat/completions" + + system_prompt = cfg.get("system_prompt", "You are a helpful assistant.") payload = { - "prompt": chat.message, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": chat.message}, + ], "n_predict": chat.n_predict, "temperature": chat.temperature, + "stop": ["\nQuestion:", "\nInput:", "\nOutput:", "\n\nQuestion:", "\n\nInput:"], } try: @@ -55,8 +84,17 @@ async def handle_chat_with_bitnet_server(chat: ChatRequest): response = await client.post(server_url, json=payload, timeout=60.0) response.raise_for_status() response_data = response.json() - # Ensure the key "content" exists before accessing it - return {"response": response_data.get("content", ""), "port": chat.port} + + # OpenAI-compatible format: choices[0].message.content + choices = response_data.get("choices", []) + if choices: + content = choices[0].get("message", {}).get("content", "") + else: + # Fallback for raw /completion style response + content = response_data.get("content", "") + + content = _clean_response(content) + return {"response": content, "port": chat.port} except httpx.RequestError as e: logger.error(f"HTTP request error to server {host}:{chat.port}: {e}") raise HTTPException(status_code=503, detail=f"Error communicating with BitNet server on port {chat.port}: {e}") From 33e3d3163919190590457da44d996c3d375cd177 Mon Sep 17 00:00:00 2001 From: srih0527 Date: Tue, 17 Mar 2026 15:17:31 -0500 Subject: [PATCH 3/3] feat: add volume mounts and uvicorn --reload for hot-reloading code changes --- docker-compose.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 0477cc9..c56fce3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,3 +5,7 @@ services: build: . ports: - "8080:8080" + volumes: + - ./app/main.py:/code/main.py + - ./app/lib:/code/lib + command: ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080", "--reload", "--reload-dir", "/code/lib"]