From 3704bdaacbec6032cf41a1d8d600057ef4c1503a Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Tue, 13 Jan 2026 19:37:47 +0000 Subject: [PATCH 1/2] Update readme for multi-gpu setting --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4547a0b..1df884e 100644 --- a/README.md +++ b/README.md @@ -61,10 +61,13 @@ mkdir -p /data/flowsim-simulate #### Step 1: Profile (Generate Traces) ```bash -sudo docker run --rm --gpus=all \ +sudo docker run --gpus=all \ -v /data/flowsim-profile:/workspace/profile \ -v /data/flowsim-simulate:/workspace/simulate \ -w /flowsim \ + --cap-add=SYS_ADMIN \ + --network=host \ + --shm-size 911G \ flowsim-image \ python scripts/run_profile.py \ --profile-dir /workspace/profile \ From bec0d2bb5aea3fe845c48a54f7379ac6f342731e Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Tue, 13 Jan 2026 21:15:43 +0000 Subject: [PATCH 2/2] Add timeout config and update tp/ep size --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1df884e..c6ae78c 100644 --- a/README.md +++ b/README.md @@ -72,8 +72,9 @@ sudo docker run --gpus=all \ python scripts/run_profile.py \ --profile-dir /workspace/profile \ --log-dir /workspace/profile/logs \ - --server-opts "--model-path /flowsim/workload/models/configs/deepseek/ --load-format dummy --tp 1 --host 0.0.0.0 --port 30001 --attention-backend flashinfer --disable-cuda-graph" \ - --bench-opts "--backend sglang --host 0.0.0.0 --port 30001 --dataset-name defined-len --prefill-decode-lens 32768:8 --num-prompts 16 --profile" + --bench-timeout 3600 \ + --server-opts "--model-path /flowsim/workload/models/configs/deepseek/ --load-format dummy --tp 4 --ep 4 --host 0.0.0.0 --port 30001 --attention-backend flashinfer --disable-cuda-graph" \ + --bench-opts "--backend sglang --host 0.0.0.0 --port 30001 --dataset-name defined-len --prefill-decode-lens 1024:8 --num-prompts 1 --profile" ``` **What this does:** @@ -81,7 
+82,7 @@ sudo docker run --gpus=all \ - Runs benchmark requests against it - Generates `*.trace.json.gz` files in `/data/flowsim-profile` (mounted as `/workspace/profile`) -**Note:** The first run will be slow (~10 minutes) due to DeepGEMM kernel warmup and compilation. For stable performance, avoid using `--rm` flag and reuse the same container. Subsequent runs with similar configurations will be faster. +**Note:** The first run will be slow (~10 minutes) due to DeepGEMM kernel warmup and compilation. For stable performance, avoid using `--rm` flag and reuse the same container using `sudo docker exec -it CONTAINER_NAME bash`. Subsequent runs with similar configurations will be faster. **Tip:** - Adjust `--server-opts` and `--bench-opts` to match your model, parallelism (TP/DP/EP), and workload requirements. All `sglang.launch_server` and `bench_serving.py` parameters are supported.