Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
1aa3935
Initial draft of cursor port
djns99 Nov 18, 2025
24531bc
Properly cache jit compiled module
djns99 Nov 19, 2025
72a46ea
Cleanup prints
djns99 Nov 19, 2025
131f402
Combine tests run without crashing
djns99 Nov 19, 2025
863a1cc
Update tests with fake_moe properly
djns99 Nov 19, 2025
da39075
Clear MOE workspace before each run
djns99 Nov 19, 2025
0a22c8e
Cleanup MPI processes on test failures
djns99 Nov 20, 2025
f22cddd
More exit handling for rank failures
djns99 Nov 20, 2025
fa1cd5b
Cleaner test implementation
djns99 Nov 20, 2025
80de9b6
Update MNNVL config setup
djns99 Nov 26, 2025
5ea1842
Update test to get ep size from MPI
djns99 Nov 26, 2025
589d03c
Update tests with better test bounds
djns99 Nov 26, 2025
a250ddf
Fix timeout logic
djns99 Nov 26, 2025
6656c49
Disable python steps for MPI tests
djns99 Nov 26, 2025
ce239b2
Standardise API name to match existing code better
djns99 Nov 27, 2025
a4602a1
Enhance tests and add convenience APIs for more general usage
djns99 Nov 27, 2025
45d7eae
Fix existing dispatch tests
djns99 Nov 27, 2025
1d54756
Tests for sanitize and combine
djns99 Nov 28, 2025
06b5f34
Fix logic for inplace combine workspace setup
djns99 Nov 28, 2025
d1a182b
Limit num tokens to allow combine to successfully run on 1 GPU
djns99 Nov 28, 2025
52908f9
Unify naming
djns99 Nov 28, 2025
4086053
Add test for payload not in the workspace and fix coderabbit comments
djns99 Nov 28, 2025
e10accf
Update comm.rst
djns99 Nov 28, 2025
28bbc95
Fix coderabbit nits
djns99 Nov 28, 2025
e03a27c
Properly export all functions
djns99 Nov 28, 2025
5ed4ee5
Add A2A single GPU tests to CI
djns99 Dec 3, 2025
c9e9747
Remove internal API from docs
djns99 Dec 3, 2025
d117915
Align workspace calculation to match TRT-LLM style
djns99 Dec 4, 2025
cef5ed7
Add checks for MNNVL support for tests that use the API
djns99 Dec 4, 2025
45fbace
Remove MNNVL from binding names
djns99 Dec 8, 2025
b25cb7c
Rename the flashinfer MOE A2A namespace
djns99 Dec 8, 2025
2791cfe
Fix code rabbit bugs
djns99 Dec 8, 2025
573cff8
Fix workspace allocation to work with non-contiguous rank allocations
djns99 Dec 10, 2025
4c72af6
Update MNNVL tests to use proper workspace allocation
djns99 Dec 10, 2025
a382ae3
Decorate API functions with flashinfer_api decorator
djns99 Dec 10, 2025
f93dd14
Fix import path for flashinfer_api
djns99 Dec 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 34 additions & 6 deletions csrc/nv_internal/cpp/common/envUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,11 +222,6 @@ bool getEnvDisaggLayerwise() {
return disaggLayerwise;
}

// Reports the value getBoolEnv yields for TRTLLM_PARALLEL_CACHE_SEND.
// The environment is consulted once, on the first call; every later call
// returns the same cached result.
bool getEnvParallelCacheSend() {
  static bool const kParallelSendEnabled = getBoolEnv("TRTLLM_PARALLEL_CACHE_SEND");
  return kParallelSendEnabled;
}

bool getEnvRequestKVCacheConcurrent() {
static bool const requestKVCacheConcurrent = getBoolEnv("TRTLLM_REQUEST_KV_CACHE_CONCURRENT");
return requestKVCacheConcurrent;
Expand Down Expand Up @@ -277,7 +272,7 @@ size_t getEnvAllReduceWorkspaceSize() {
return workspaceSize;
}

std::string getEnvKVCacheTransferOutputPath() {
// Returns the path configured through TRTLLM_KVCACHE_TIME_OUTPUT_PATH, or an
// empty string when getStrEnv yields no value. The string is initialised on
// the first call and the same object is returned by reference afterwards.
std::string const& getEnvKVCacheTimeOutputPath() {
  static std::string cachedOutputPath =
      getStrEnv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH").value_or("");
  return cachedOutputPath;
}
Expand Down Expand Up @@ -328,4 +323,37 @@ uint16_t getEnvNixlPort() {

// Reports the value getBoolEnv yields for TRTLLM_DISAGG_BENCHMARK_GEN_ONLY.
// The result is not cached: getBoolEnv is invoked on every call.
bool getEnvDisaggBenchmarkGenOnly() {
  bool const genOnly = getBoolEnv("TRTLLM_DISAGG_BENCHMARK_GEN_ONLY");
  return genOnly;
}

// Whether the MoE A2A kernels should use one block per token.
// Controlled by TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN, which is read once on the
// first call. Returns true when getIntEnv yields no value, and otherwise true
// for any non-zero integer (so only a value of 0 disables the option).
bool getEnvMoeA2AOneBlockPerToken() {
  static std::optional<int32_t> const kSetting = getIntEnv("TLLM_MOE_A2A_ONE_BLOCK_PER_TOKEN");
  return !kSetting.has_value() || *kSetting != 0;
}

// Normalises a requested block size (threads per block) for the MoE A2A kernels.
//
//  - An unset or non-positive request falls back to the default of 256.
//  - A request above 1024 is clamped to 1024.
//  - The result is then rounded UP to the next multiple of 32 (the warp size).
//
// The returned value is therefore always a multiple of 32 in the range [32, 1024].
static int sanitizeBlockSize(std::optional<int32_t> const& val) {
  constexpr int kDefaultBlockSize = 256;
  constexpr int kMaxBlockSize = 1024;
  constexpr int kWarpSize = 32;

  int block = val.value_or(kDefaultBlockSize);
  if (block <= 0) {
    block = kDefaultBlockSize;
  } else if (block > kMaxBlockSize) {
    block = kMaxBlockSize;
  }

  // block is in [1, kMaxBlockSize] here, so the addition cannot overflow and
  // the rounded result can never be zero; no further fallback is needed.
  return (block + kWarpSize - 1) / kWarpSize * kWarpSize;
}

int getEnvMoeA2ADispatchBlockSize() {
static int const kBlock = sanitizeBlockSize(getIntEnv("TLLM_MOE_A2A_DISPATCH_BLOCK_SIZE"));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we document env vars too?

Copy link
Author

@djns99 djns99 Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bobboli can you advise? Should we remove these, or do you know the cases in which these should be enabled?

return kBlock;
}

// Block size (threads per block) for the MoE A2A Combine kernels.
// Reads TLLM_MOE_A2A_COMBINE_BLOCK_SIZE once, on the first call, passes the
// value through sanitizeBlockSize, and returns the cached result afterwards.
int getEnvMoeA2ACombineBlockSize() {
  static int const kCombineBlockSize = [] {
    auto const requested = getIntEnv("TLLM_MOE_A2A_COMBINE_BLOCK_SIZE");
    return sanitizeBlockSize(requested);
  }();
  return kCombineBlockSize;
}

// Reports the value getBoolEnv yields for TRTLLM_EPLB_FORCE_GDRCOPY.
// The result is not cached: getBoolEnv is invoked on every call.
bool getEnvEplbForceGdrcopy() {
  bool const forceGdrcopy = getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY");
  return forceGdrcopy;
}

} // namespace tensorrt_llm::common
11 changes: 10 additions & 1 deletion csrc/nv_internal/tensorrt_llm/common/envUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ bool getEnvDisableKVCacheTransferOverlap();

bool getEnvEnableReceiveKVCacheParallel();

std::string getEnvKVCacheTransferOutputPath();
std::string const& getEnvKVCacheTimeOutputPath();

bool getEnvTryZCopyForKVCacheTransfer();

Expand Down Expand Up @@ -92,4 +92,13 @@ size_t getEnvKVCacheSendMaxConcurrenceNum();

size_t getEnvMemSizeForKVCacheTransferBuffer();

// Whether to use one block per token for MoE A2A kernels (default true).
bool getEnvMoeA2AOneBlockPerToken();

// TODO: Temporary; for development purposes only.
// Block size (threads per block) for MoE A2A Dispatch kernels (default 256 if unset or invalid)
int getEnvMoeA2ADispatchBlockSize();
// Block size (threads per block) for MoE A2A Combine kernels (default 256 if unset or invalid)
int getEnvMoeA2ACombineBlockSize();

} // namespace tensorrt_llm::common
Loading