Skip to content

Commit b5a2947

Browse files
committed
Update benchmark pods
Signed-off-by: dannawang <dannawang@google.com>
1 parent 7d81d90 commit b5a2947

File tree

4 files changed

+25
-7
lines changed

4 files changed

+25
-7
lines changed

examples/offload/gke/benchmarks/deploy-baseline.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ spec:
2929
name: hf-token-secret
3030
key: token
3131
- name: SKIP_JAX_PRECOMPILE
32-
value: "1"
32+
value: "0"
3333
ports:
3434
- containerPort: 8000
3535
resources:

examples/offload/gke/benchmarks/deploy-cpu-offload.yaml

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,24 @@ spec:
1616
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
1717
cloud.google.com/gke-tpu-topology: 2x4 # Specify the physical topology for the TPU slice.
1818
initContainers:
19-
- name: increase-vm-max-map-count
19+
- name: tpu-node-setup
2020
image: busybox
21-
# WARNING: This changes the HOST memory settings (vm.max_map_count), not just the container.
22-
# Required to prevent vLLM crashes due to memory mapping limits.
23-
command: ["sysctl", "-w", "vm.max_map_count=1048576"]
21+
command: ["/bin/sh", "-c"]
22+
args:
23+
- |
24+
# WARNING: This changes the HOST memory settings, not just the container.
25+
# Required to prevent vLLM crashes due to memory mapping limits.
26+
sysctl -w vm.max_map_count=8388608
27+
28+
# Check if the VFIO IOMMU module parameter exists, and if so, increase the
29+
# limit on DMA mappings. This allows the TPU driver to pin and map a
30+
# larger number of memory pages for direct hardware access.
31+
if [ -f /sys/module/vfio_iommu_type1/parameters/dma_entry_limit ]; then
32+
echo 2000000 > /sys/module/vfio_iommu_type1/parameters/dma_entry_limit
33+
echo "Successfully increased dma_entry_limit to 2000000"
34+
else
35+
echo "Warning: vfio_iommu_type1 module parameter not found. Ensure the module is loaded."
36+
fi
2437
securityContext:
2538
privileged: true
2639
containers:
@@ -37,7 +50,7 @@ spec:
3750
name: hf-token-secret
3851
key: token
3952
- name: SKIP_JAX_PRECOMPILE
40-
value: "1"
53+
value: "0"
4154
- name: TPU_OFFLOAD_NUM_CPU_CHUNKS
4255
value: "4096"
4356
- name: TPU_OFFLOAD_NUM_STAGING_BLOCKS

tpu_inference/envs.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
TPU_OFFLOAD_DECODE_SAVE: bool = False
3030
TPU_OFFLOAD_NUM_CPU_CHUNKS: int = 1024
3131
TPU_OFFLOAD_NUM_STAGING_BLOCKS: int = 128
32+
TPU_OFFLOAD_SAVE_THREADS: int = 1
3233

3334

3435
def env_with_choices(
@@ -142,6 +143,9 @@ def _get_validated_env() -> str | None:
142143
# kv offload to dram: size of staging buffer (hbm) for swap
143144
"TPU_OFFLOAD_NUM_STAGING_BLOCKS":
144145
lambda: int(os.getenv("TPU_OFFLOAD_NUM_STAGING_BLOCKS", "128")),
146+
# kv offload to dram: number of threads for asynchronous TPU -> CPU data transfer
147+
"TPU_OFFLOAD_SAVE_THREADS":
148+
lambda: int(os.getenv("TPU_OFFLOAD_SAVE_THREADS", "1")),
145149
}
146150

147151

tpu_inference/offload/tpu_offload_connector.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1219,8 +1219,9 @@ def __init__(self, vllm_config: VllmConfig,
12191219

12201220
self.cpu_chunk_size = self.block_size
12211221
# Thread pool for asynchronous TPU->CPU copies
1222+
self.num_save_threads = envs.TPU_OFFLOAD_SAVE_THREADS
12221223
self.save_executor = ThreadPoolExecutor(
1223-
max_workers=4, thread_name_prefix="tpu_save_handler")
1224+
max_workers=self.num_save_threads, thread_name_prefix="tpu_save_handler")
12241225
self.finished_save_reqs: set[ReqId] = set()
12251226
self.finished_load_reqs: set[ReqId] = set()
12261227
# Tracks if wait_for_save has been called for the current step's metadata.

0 commit comments

Comments
 (0)