flashinfer-ai · djns99 · Nov 18, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
@@ -222,11 +222,6 @@ bool getEnvDisaggLayerwise() {
   return disaggLayerwise;
 }
 
-bool getEnvParallelCacheSend() {
-  static bool const parallelCacheSend = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND");
-  return parallelCacheSend;
-}
-
 bool getEnvRequestKVCacheConcurrent() {
   static bool const requestKVCacheConcurrent = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_CONCURRENT");
   return requestKVCacheConcurrent;
@@ -277,7 +272,7 @@ size_t getEnvAllReduceWorkspaceSize() {
   return workspaceSize;
 }
 
-std::string getEnvKVCacheTransferOutputPath() {
+std::string const& getEnvKVCacheTimeOutputPath() {
   static std::string outputPath = getStrEnv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH").value_or("");
   return outputPath;
 }
@@ -328,4 +323,37 @@ uint16_t getEnvNixlPort() {
 
 bool getEnvDisaggBenchmarkGenOnly() { return getBoolEnv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY"); }
 
+bool getEnvMoeA2AOneBlockPerToken() {
+  // TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN is queried on the first call only; the static keeps it.
+  static std::optional<int32_t> const kFlag = getIntEnv("TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN");
+  // When getIntEnv yields no value the option defaults to enabled (treated as 1);
+  // otherwise any non-zero integer enables it and 0 disables it.
+  int32_t const effective = kFlag.value_or(1);
+  return effective != 0;
+}
+
+static int sanitizeBlockSize(std::optional<int32_t> const& val) {
+  // Turns an optional requested block size into a usable one: a missing or non-positive
+  // request becomes 256, a request above 1024 is capped at 1024, and the result is then
+  // rounded UP (not to nearest) to a multiple of 32, e.g. 1 -> 32, 33 -> 64, 1000 -> 1024.
+  int block = val.value_or(256);
+  if (block <= 0) block = 256;
+  if (block > 1024) block = 1024;
+  // block is in [1, 1024] here, so rounding up gives [32, 1024] and can never produce 0.
+  block = (block + 31) / 32 * 32;
+  return block;
+}
+
+int getEnvMoeA2ADispatchBlockSize() {  // threads per block for MoE A2A dispatch kernels
+  static int const kThreads = sanitizeBlockSize(getIntEnv("TLLM_MOE_A2A_DISPATCH_BLOCK_SIZE"));
+  return kThreads;  // sanitized once on first call, then reused
+}
+
+int getEnvMoeA2ACombineBlockSize() {  // threads per block for MoE A2A combine kernels
+  static int const kThreads = sanitizeBlockSize(getIntEnv("TLLM_MOE_A2A_COMBINE_BLOCK_SIZE"));
+  return kThreads;  // sanitized once on first call, then reused
+}
+
+bool getEnvEplbForceGdrcopy() { return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY"); }  // not cached
+
 }  // namespace tensorrt_llm::common
@@ -64,7 +64,7 @@ bool getEnvDisableKVCacheTransferOverlap();
 
 bool getEnvEnableReceiveKVCacheParallel();
 
-std::string getEnvKVCacheTransferOutputPath();
+std::string const& getEnvKVCacheTimeOutputPath();
 
 bool getEnvTryZCopyForKVCacheTransfer();
 
@@ -92,4 +92,13 @@ size_t getEnvKVCacheSendMaxConcurrenceNum();
 
 size_t getEnvMemSizeForKVCacheTransferBuffer();
 
+// Whether to use one block per token for MoE A2A kernels (default true).
+bool getEnvMoeA2AOneBlockPerToken();
+
+// TODO: Temporary; for development use only.
+// Block size (threads per block) for MoE A2A Dispatch kernels (default 256 if unset or invalid)
+int getEnvMoeA2ADispatchBlockSize();
+// Block size (threads per block) for MoE A2A Combine kernels (default 256 if unset or invalid)
+int getEnvMoeA2ACombineBlockSize();
+
 }  // namespace tensorrt_llm::common