Merged
6 changes: 4 additions & 2 deletions .env.example
@@ -3,8 +3,10 @@ HF_TOKEN=<secret>

# Caches. Optional.
## Hugging Face
-HF_HOME="/path/to/your/huggingface/cache"
+HF_HOME=~/.cache/huggingface
## DeepGEMM
-SGL_DG_CACHE_DIR=~/.cache/deep_gemm
+SGLANG_DG_CACHE_DIR=~/.cache/deep_gemm
## Triton
TRITON_HOME=~
## Tilelang
TILELANG_CACHE_DIR=~/.tilelang/cache
58 changes: 55 additions & 3 deletions README.md
@@ -136,6 +136,29 @@ See the following pages for more details:

- [Running OpenAI API server examples (SGlang)](docs/USAGE.sglang.md)

### Docker Compose

Docker Compose examples are available in the [`docker-compose`](/docker-compose) folder.

```bash
# First, copy .env.example to .env and edit it for your environment
cp .env.example .env
vim .env

# Start sglang server
docker compose \
--env-file .env \
-f docker-compose/sglang-server.yaml \
--project-name hip-attention-sglang-server-local \
up

# Start sglang router
docker compose \
-f docker-compose/sglang-router.yaml \
--project-name hip-attention-sglang-router-local \
up
```
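The server can take a long time to load the model after `up` returns (the server compose file allows a 30-minute health-check `start_period`), so it can help to wait for the health endpoint before starting the router. A minimal wait-loop sketch, assuming the server port `10080` from the compose files; the `curl` line is commented out so the sketch stays inert off a live host:

```shell
# Sketch: wait for the sglang server to report healthy before starting the router.
# Assumes port 10080 from docker-compose/sglang-server.yaml.
port=10080
url="http://localhost:${port}/health"
echo "Polling ${url}"
# Uncomment on a live host:
# until curl -sf "${url}" > /dev/null; do sleep 5; done
# echo "Server is healthy"
```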

## Experiment Reproduce

Check the [how to reproduce experiments](docs/REPRODUCE.md) page.
@@ -199,10 +222,39 @@ git clone git@github.com:DeepAuto-AI/hip-attention.git
cd hip-attention
docker login

-docker build -t deepauto/hip-attention:latest -t deepauto/hip-attention:latest-sglang -t deepauto/hip-attention:$(git rev-parse --short HEAD)-sglang -t deepauto/hip-attention:v$(uv run python -c 'import importlib.metadata; print(importlib.metadata.version("hip-attn"))')-sglang -f Dockerfile.sglang .
tag_git_short=$(git rev-parse --short HEAD)-sglang
tag_hip_attention_sglang=v$(uv run python -c 'import importlib.metadata; print(importlib.metadata.version("hip-attn"))')-sglang

# Build sglang server image
docker build . \
-f Dockerfile.sglang \
-t deepauto/hip-attention:latest \
-t deepauto/hip-attention:latest-sglang \
-t deepauto/hip-attention:${tag_git_short} \
-t deepauto/hip-attention:${tag_hip_attention_sglang}

# Publish sglang server image
docker push deepauto/hip-attention:latest
docker push deepauto/hip-attention:latest-sglang
-docker push deepauto/hip-attention:$(git rev-parse --short HEAD)-sglang
-docker push deepauto/hip-attention:v$(uv run python -c 'import importlib.metadata; print(importlib.metadata.version("hip-attn"))')-sglang
docker push deepauto/hip-attention:${tag_git_short}
docker push deepauto/hip-attention:${tag_hip_attention_sglang}

# Build sglang router image (assumes the DeepAuto-AI/sglang fork is checked out next to hip-attention)
cd ../sglang

docker build . \
-f docker/Dockerfile.router \
--no-cache \
-t deepauto/sglang-router:latest \
-t deepauto/sglang-router:latest-sglang \
-t deepauto/sglang-router:${tag_git_short} \
-t deepauto/sglang-router:${tag_hip_attention_sglang}

# Publish sglang router image
docker push deepauto/sglang-router:latest
docker push deepauto/sglang-router:latest-sglang
docker push deepauto/sglang-router:${tag_git_short}
docker push deepauto/sglang-router:${tag_hip_attention_sglang}

cd -
```
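The build script above derives two tags from the current git commit and the installed `hip-attn` package version. A sketch of how they expand, using hypothetical stand-in values (`3192b97`, `1.2.9`) in place of the actual command substitutions:

```shell
# Hypothetical stand-ins for the command substitutions in the build script above
commit_short=3192b97        # would be: $(git rev-parse --short HEAD)
hip_attn_version=1.2.9      # would be: $(uv run python -c 'import importlib.metadata; ...')
tag_git_short=${commit_short}-sglang
tag_hip_attention_sglang=v${hip_attn_version}-sglang
echo "deepauto/hip-attention:${tag_git_short}"
echo "deepauto/hip-attention:${tag_hip_attention_sglang}"
```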
43 changes: 43 additions & 0 deletions docker-compose/sglang-router.yaml
@@ -0,0 +1,43 @@
services:
  sglang-router:
    # https://hub.docker.com/r/lmsysorg/sglang-router/tags
    image: deepauto/sglang-router:v1.2.9-sglang
    # Host network to use Tailscale
    network_mode: host
    command:
      - --host
      - "0.0.0.0"
      - --port
      - "10090"
      # Worker Configuration
      - --worker-urls
      - http://h100-80-1:10080
      # - http://h100-80-2:10080
> **Copilot AI** (Oct 10, 2025), on lines +14 to +15: Hardcoded hostname `h100-80-1` makes this configuration environment-specific and less portable. Consider using environment variables or Docker service names for worker URLs. Suggested change:
>
> ```diff
> -      - http://h100-80-1:10080
> -      # - http://h100-80-2:10080
> +      - ${WORKER_URLS:-http://sglang-worker:10080}
> +      # - ${WORKER_URLS_2:-http://sglang-worker-2:10080}
> ```

> **Copilot AI** (Oct 10, 2025), on lines +14 to +15: The hardcoded hostname `h100-80-1` should be documented or made configurable via environment variables for better portability across different deployment environments. Suggested change:
>
> ```diff
> -      - http://h100-80-1:10080
> -      # - http://h100-80-2:10080
> +      - http://${WORKER_HOSTNAME:-h100-80-1}:10080  # Set WORKER_HOSTNAME env var to override
> +      # - http://${WORKER_HOSTNAME_2:-h100-80-2}:10080  # Example for a second worker
> ```
      - --worker-startup-timeout-secs
      - "300"
      # Algorithm
      - --policy
      - "cache_aware"
      # Retry Policy
      - --retry-backoff-multiplier
      - "2.0"
      - --retry-initial-backoff-ms
      - "100"
      - --retry-max-backoff-ms
      - "10000"
      # At max backoff: 10 sec * 360 retries = 3600 sec = 1 hour
      - --retry-max-retries
      - "360"
      # Request timeout of 2 hours
      - --request-timeout-secs
      - "7200"
      # Monitor the router
      - --prometheus-host
      - "0.0.0.0"
      - --prometheus-port
      - "10091"
    ports:
      # Router port
      - 10090:10090
> **Copilot AI** (Oct 10, 2025): The ports mapping is incomplete. The prometheus port 10091 is configured in the command but not exposed in the ports section, which will prevent external access to monitoring metrics.

> **Copilot AI** (Oct 10, 2025): The ports configuration is incomplete. The router is configured to listen on port 10090 and also expose Prometheus metrics on port 10091, but only port 10090 is exposed. Add port mapping for 10091:10091 to make Prometheus metrics accessible.
      # Monitoring port
      - 10091:10091
> **Copilot AI** (Oct 10, 2025), on lines +39 to +43: Port mappings are redundant when using `network_mode: host`. The host network mode makes all container ports directly accessible on the host, so explicit port mappings are unnecessary and could cause confusion. Suggested change:
>
> ```diff
> -    ports:
> -      # Router port
> -      - 10090:10090
> -      # Monitoring port
> -      - 10091:10091
> ```
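The router's retry flags above define an exponential backoff that doubles from 100 ms and caps at 10000 ms; once capped, 360 retries at 10 s each give roughly the one hour noted in the config comment. A quick sketch of the resulting schedule:

```shell
# Backoff schedule per the router flags: initial 100 ms, multiplier 2.0, max 10000 ms
backoff=100
for i in 1 2 3 4 5 6 7 8; do
  echo "retry ${i}: backoff ${backoff} ms"
  backoff=$((backoff * 2))
  # Cap at --retry-max-backoff-ms
  if [ "${backoff}" -gt 10000 ]; then backoff=10000; fi
done
```

The cap is reached by the eighth retry; every retry after that waits the full 10 seconds.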
106 changes: 106 additions & 0 deletions docker-compose/sglang-server.yaml
@@ -0,0 +1,106 @@
services:
  sglang-server:
    # https://hub.docker.com/r/deepauto/hip-attention/tags
    image: deepauto/hip-attention:v1.2.9-sglang
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '1', '2', '3', '4', '5', '6', '7']
              capabilities: [gpu]
    volumes:
      - type: volume
        source: sglang-cache
        target: /root/.cache
      - type: bind
        source: ${HF_HOME:?error}
> **Copilot AI** (Oct 10, 2025): The error message 'error' is too generic and unhelpful. Consider using a more descriptive message like 'HF_HOME environment variable is required but not set'. Suggested change:
>
> ```diff
> -        source: ${HF_HOME:?error}
> +        source: ${HF_HOME:?HF_HOME environment variable is required but not set}
> ```

> **Copilot AI** (Oct 10, 2025): The error message 'error' is not descriptive. Consider using a more helpful message like 'HF_HOME environment variable is required'. Suggested change:
>
> ```diff
> -        source: ${HF_HOME:?error}
> +        source: ${HF_HOME:?HF_HOME environment variable is required}
> ```
        target: /root/.cache/huggingface
    command:
      - python
      - -m
      - sglang.launch_server
      - --host
      - "0.0.0.0"
      - --port
      - "10080"
      - --model-path
      - Qwen/Qwen3-235B-A22B-Instruct-2507-FP8
      - --kv-cache-dtype
      - auto
      - --ep-size
      - "8"
      - --tp-size
      - "8"
      - --chunked-prefill-size
      - "65536"
      - --max-prefill-tokens
      - "65536"
      - --cuda-graph-bs
      - "1"
      - "2"
      - "4"
      - "8"
      - "16"
      - "24"
      - "32"
      - "48"
      - "64"
      - "96"
      - "128"
      - "160"
      - "192"
      - "256"
      - --context-length
      - "256000"
      - --max-total-tokens
      - "256000"
      - --attention-backend
      - hip_attention
      - --hip-attention-config
      - ./configs/mixed_landmark_0814_no_extend_qsa.json
      - --hip-attention-config-override-json
      - '{"__seq_thresh_fa3": 65536}'
      - --json-model-override-args
      - '{"rope_scaling":{"rope_type":"yarn","factor":1.0,"original_max_position_embeddings":262144}, "max_position_embeddings": 262144}'
      - --max-running-requests
      - "64"
      - --trust-remote-code
      - --tool-call-parser
      - qwen25
    environment:
      - HF_HOME=/root/.cache/huggingface
      - SGLANG_DG_CACHE_DIR=/root/.cache/deep_gemm
      - TRITON_HOME=/root/.cache
      - TILELANG_CACHE_DIR=/root/.cache/tilelang
      - BSA_K=32
      - BSA_EXACT_K=32
      - BSA_BLOCK_K=64
      - HIP_DEBUG_DELTA_QSA=1
      - HIP_DEBUG_RECOMPUTE_SPLIT=0
      - TRITON_PRINT_AUTOTUNING=1
      - SRT_WARMUP_ALL_SEQ_LENS=0
      - HIP_DEBUG_FA3_MIXING_LEN=0
      - PASSKEY_DECODE_LEN=128
      - PASSKEY_LEN=150
      - SA_BLOCK_SIZE=128
      - SA_DECODE_BLOCK_SIZE=128
      - HIP_DISABLE_AUTOTUNE=0
      - HIP_DEBUG=0
      - HIP_DEBUG_BENCH=0
      - HIP_DEBUG_CAPTURE_DECORATOR=1
      - CUDA_LAUNCH_BLOCKING=0
    ports:
      - 10080:10080
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:10080/health || exit 1"]
      interval: 5s
      timeout: 60s
      retries: 1
      start_period: 1800s

volumes:
  sglang-cache:
    driver: local
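The server reserves eight GPU device IDs and sets `--tp-size 8`, so the host needs at least eight visible GPUs. A pre-flight check sketch; the `nvidia-smi` call is commented out (with a stand-in value of 8) so it stays inert off a GPU host:

```shell
# Sketch: confirm the GPU count covers --tp-size before `docker compose up`
tp_size=8
gpu_count=8
# On a live host: gpu_count=$(nvidia-smi -L | wc -l)
if [ "${gpu_count}" -ge "${tp_size}" ]; then
  echo "ok: ${gpu_count} GPUs for tp-size ${tp_size}"
else
  echo "need ${tp_size} GPUs, found ${gpu_count}" >&2
fi
```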
5 changes: 3 additions & 2 deletions docs/USAGE.sglang.md
@@ -1269,7 +1269,7 @@ python \
- Tested at: 2025-10-08
- Tested version:
  - `hip-attention`: `3192b974685791ab08f9278a4e23be4618a227fc`
-  - `sglang` ([DeepAuto-AI/sglang](https://github.com/DeepAuto-AI/sglang)): `a2e22f83f39645d13b40f663ddc7f9fb199f5d13`
+  - `sglang` ([DeepAuto-AI/sglang](https://github.com/DeepAuto-AI/sglang)): `eb1197fd7ad372de83a1589ec99c101054c25cf1`

#### Local

@@ -1337,8 +1337,9 @@ docker run \
--mount type=volume,src=cache-${name}-${version},target=/root/.cache \
--mount type=bind,source=${HF_HOME:-"$HOME/.cache/huggingface"},target=/root/.cache/huggingface \
--env "HF_HOME=/root/.cache/huggingface" \
-  --env "SGL_DG_CACHE_DIR=/root/.cache/deep_gemm" \
+  --env "SGLANG_DG_CACHE_DIR=/root/.cache/deep_gemm" \
--env "TRITON_HOME=/root/.cache" \
--env "TILELANG_CACHE_DIR=/root/.cache/tilelang" \
-p ${port}:${port} \
--ipc=host \
--health-cmd "curl -f http://localhost:${port}/health || exit 1" \