From 3704bdaacbec6032cf41a1d8d600057ef4c1503a Mon Sep 17 00:00:00 2001
From: Tao Zhang <zhangt@microsoft.com>
Date: Tue, 13 Jan 2026 19:37:47 +0000
Subject: [PATCH 1/2] Update readme for multi-gpu setting

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4547a0b..1df884e 100644
--- a/README.md
+++ b/README.md
@@ -61,10 +61,13 @@ mkdir -p /data/flowsim-simulate
 #### Step 1: Profile (Generate Traces)
 
 ```bash
-sudo docker run --rm --gpus=all \
+sudo docker run --gpus=all \
   -v /data/flowsim-profile:/workspace/profile \
   -v /data/flowsim-simulate:/workspace/simulate \
   -w /flowsim \
+  --cap-add=SYS_ADMIN \
+  --network=host \
+  --shm-size 911G \
   flowsim-image \
   python scripts/run_profile.py \
     --profile-dir /workspace/profile \

From bec0d2bb5aea3fe845c48a54f7379ac6f342731e Mon Sep 17 00:00:00 2001
From: Tao Zhang <zhangt@microsoft.com>
Date: Tue, 13 Jan 2026 21:15:43 +0000
Subject: [PATCH 2/2] Add timeout config and update tp/ep size

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1df884e..c6ae78c 100644
--- a/README.md
+++ b/README.md
@@ -72,8 +72,9 @@ sudo docker run --gpus=all \
   python scripts/run_profile.py \
     --profile-dir /workspace/profile \
     --log-dir /workspace/profile/logs \
-    --server-opts "--model-path /flowsim/workload/models/configs/deepseek/ --load-format dummy --tp 1 --host 0.0.0.0 --port 30001 --attention-backend flashinfer --disable-cuda-graph" \
-    --bench-opts "--backend sglang --host 0.0.0.0 --port 30001 --dataset-name defined-len --prefill-decode-lens 32768:8 --num-prompts 16 --profile"
+    --bench-timeout 3600 \
+    --server-opts "--model-path /flowsim/workload/models/configs/deepseek/ --load-format dummy --tp 4 --ep 4 --host 0.0.0.0 --port 30001 --attention-backend flashinfer --disable-cuda-graph" \
+    --bench-opts "--backend sglang --host 0.0.0.0 --port 30001 --dataset-name defined-len --prefill-decode-lens 1024:8 --num-prompts 1 --profile"
 ```
 
 **What this does:**
@@ -81,7 +82,7 @@ sudo docker run --gpus=all \
 - Runs benchmark requests against it
 - Generates `*.trace.json.gz` files in `/data/flowsim-profile` (mounted as `/workspace/profile`)
 
-**Note:** The first run will be slow (~10 minutes) due to DeepGEMM kernel warmup and compilation. For stable performance, avoid using `--rm` flag and reuse the same container. Subsequent runs with similar configurations will be faster.
+**Note:** The first run will be slow (~10 minutes) due to DeepGEMM kernel warmup and compilation. For stable performance, avoid the `--rm` flag and reuse the same container, e.g. by attaching to it with `sudo docker exec -it <container_id> bash`. Subsequent runs with similar configurations will be faster.
 
 **Tip:** 
 - Adjust `--server-opts` and `--bench-opts` to match your model, parallelism (TP/DP/EP), and workload requirements. All `sglang.launch_server` and `bench_serving.py` parameters are supported.