Update benchmark pods

dannawang0221 · dannawang0221 · commit b5a294714f99 · 2025-12-17T00:20:36.000Z
Signed-off-by: dannawang <dannawang@google.com>
diff --git a/examples/offload/gke/benchmarks/deploy-baseline.yaml b/examples/offload/gke/benchmarks/deploy-baseline.yaml
@@ -29,7 +29,7 @@ spec:
               name: hf-token-secret
               key: token
         - name: SKIP_JAX_PRECOMPILE
-          value: "1"
+          value: "0"
         ports:
         - containerPort: 8000
         resources:
diff --git a/examples/offload/gke/benchmarks/deploy-cpu-offload.yaml b/examples/offload/gke/benchmarks/deploy-cpu-offload.yaml
@@ -16,11 +16,24 @@ spec:
         cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
         cloud.google.com/gke-tpu-topology: 2x4 # Specify the physical topology for the TPU slice.
       initContainers:
-        - name: increase-vm-max-map-count
+        - name: tpu-node-setup
           image: busybox
-          # WARNING: This changes the HOST memory settings (vm.max_map_count), not just the container.
-          # Required to prevent vLLM crashes due to memory mapping limits.
-          command: ["sysctl", "-w", "vm.max_map_count=1048576"]
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              # WARNING: This changes the HOST memory settings, not just the container.
+              # Required to prevent vLLM crashes due to memory mapping limits.
+              sysctl -w vm.max_map_count=8388608
+
+              # Check if the VFIO IOMMU module parameter exists, and if so, increase the
+              # limit on DMA mappings. This allows the TPU driver to pin and map a
+              # larger number of memory pages for direct hardware access.
+              if [ -f /sys/module/vfio_iommu_type1/parameters/dma_entry_limit ]; then
+                echo 2000000 > /sys/module/vfio_iommu_type1/parameters/dma_entry_limit
+                echo "Successfully increased dma_entry_limit to 2000000"
+              else
+                echo "Warning: vfio_iommu_type1 module parameter not found. Ensure the module is loaded."
+              fi
           securityContext:
             privileged: true
       containers:
@@ -37,7 +50,7 @@ spec:
               name: hf-token-secret
               key: token
         - name: SKIP_JAX_PRECOMPILE
-          value: "1"
+          value: "0"
         - name: TPU_OFFLOAD_NUM_CPU_CHUNKS
           value: "4096"
         - name: TPU_OFFLOAD_NUM_STAGING_BLOCKS
diff --git a/tpu_inference/envs.py b/tpu_inference/envs.py
@@ -29,6 +29,7 @@
     TPU_OFFLOAD_DECODE_SAVE: bool = False
     TPU_OFFLOAD_NUM_CPU_CHUNKS: int = 1024
     TPU_OFFLOAD_NUM_STAGING_BLOCKS: int = 128
+    TPU_OFFLOAD_SAVE_THREADS: int = 1
 
 
 def env_with_choices(
@@ -142,6 +143,9 @@ def _get_validated_env() -> str | None:
     # kv offload to dram: size of staging buffer (hbm) for swap
     "TPU_OFFLOAD_NUM_STAGING_BLOCKS":
     lambda: int(os.getenv("TPU_OFFLOAD_NUM_STAGING_BLOCKS", "128")),
+    # kv offload to dram: number of threads for asynchronous TPU -> CPU data transfer
+    "TPU_OFFLOAD_SAVE_THREADS":
+    lambda: int(os.getenv("TPU_OFFLOAD_SAVE_THREADS", "1")),
 }
 
 
diff --git a/tpu_inference/offload/tpu_offload_connector.py b/tpu_inference/offload/tpu_offload_connector.py
@@ -1219,8 +1219,9 @@ def __init__(self, vllm_config: VllmConfig,
 
         self.cpu_chunk_size = self.block_size
         # Thread pool for asynchronous TPU->CPU copies
+        self.num_save_threads = envs.TPU_OFFLOAD_SAVE_THREADS
         self.save_executor = ThreadPoolExecutor(
-            max_workers=4, thread_name_prefix="tpu_save_handler")
+            max_workers=self.num_save_threads, thread_name_prefix="tpu_save_handler")
         self.finished_save_reqs: set[ReqId] = set()
         self.finished_load_reqs: set[ReqId] = set()
         # Tracks if wait_for_save has been called for the current step's metadata.