Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,6 @@ custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_template.h

custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_*.cu
custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_template.h

# Bench output artifacts (T53)
runs/
12 changes: 12 additions & 0 deletions benchmarks/yaml/eb45-21b-a3b-32k-bf16-kv50-512s.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# T53 bench workload — KV-bound (not slot-bound); gate: FD_HEAD_WISE_KV_CACHE=1
# max_num_seqs raised to 256 so the KV pool, not the slot count, is the bottleneck.
# kv_cache_ratio: 0.30 → ~24GB KV on A800-80GB (TINY envelope diagnostic per opus v2 verdict).
# (A ratio of 0.35 caused a deterministic OOM at 78.99GB during weights load, at index 3408/3689 —
# 4 identical failures. 0.50 also OOMs. Revert to 0.35 before SMOKE/FULL only after the opus comparability decision.)
# Use with: INPUT_LEN=8192 OUTPUT_LEN=4096 REQUEST_RATE=8
#
max_model_len: 32768
max_num_seqs: 256
kv_cache_ratio: 0.30
tensor_parallel_size: 1
max_num_batched_tokens: 32768
35 changes: 33 additions & 2 deletions custom_ops/gpu_ops/append_attention.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ void AppendAttentionKernel(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& block_tables_headwise,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
const paddle::Tensor& encoder_num_blocks,
Expand Down Expand Up @@ -96,6 +97,17 @@ void AppendAttentionKernel(
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;

// Dtype guards for Python-supplied INT32 metadata tensors accessed via
// .data<int>() below. Catches accidental INT64/FP dtype before UB.
PD_CHECK(set_max_lengths.dtype() == paddle::DataType::INT32,
"set_max_lengths must be INT32");
PD_CHECK(encoder_num_blocks.dtype() == paddle::DataType::INT32,
"encoder_num_blocks must be INT32");
PD_CHECK(kv_num_blocks.dtype() == paddle::DataType::INT32,
"kv_num_blocks must be INT32");
PD_CHECK(decoder_num_blocks.dtype() == paddle::DataType::INT32,
"decoder_num_blocks must be INT32");

const int max_len_this_time = set_max_lengths.data<int>()[0];
const int max_enc_len_this_time = set_max_lengths.data<int>()[1];
const int max_dec_len_this_time = set_max_lengths.data<int>()[2];
Expand Down Expand Up @@ -155,6 +167,7 @@ void AppendAttentionKernel(
batch_id_per_token,
cu_seqlens_q,
block_tables,
block_tables_headwise,
lambda_batch_ids,
lambda_tile_ids_per_batch,
cache_quant_type_str,
Expand Down Expand Up @@ -488,6 +501,9 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const paddle::optional<paddle::Tensor>& sinks,
const paddle::optional<paddle::Tensor>&
block_tables_headwise, // logical 3D, physical rank-2 [max_num_seqs *
// local_kv_heads, max_blocks_per_head]
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -580,6 +596,8 @@ std::vector<paddle::Tensor> AppendAttention(
}

if (mask_offset) {
PD_CHECK(mask_offset.get().dtype() == paddle::DataType::INT32,
"mask_offset must be INT32");
meta_data.mask_offset = mask_offset.get().data<int>();
}

Expand All @@ -595,6 +613,7 @@ std::vector<paddle::Tensor> AppendAttention(
batch_id_per_token,
cu_seqlens_q,
block_tables,
block_tables_headwise,
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks,
Expand Down Expand Up @@ -700,6 +719,9 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const paddle::optional<paddle::Tensor>& sinks,
const paddle::optional<paddle::Tensor>&
block_tables_headwise, // logical 3D, physical rank-2 [max_num_seqs *
// local_kv_heads, max_blocks_per_head]
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -738,6 +760,8 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
meta_data.batch_size = seq_lens_this_time.dims()[0];

if (mask_offset) {
PD_CHECK(mask_offset.get().dtype() == paddle::DataType::INT32,
"mask_offset must be INT32");
meta_data.mask_offset = mask_offset.get().data<int>();
}

Expand All @@ -753,6 +777,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
batch_id_per_token,
cu_seqlens_q,
block_tables,
block_tables_headwise,
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks,
Expand Down Expand Up @@ -871,6 +896,7 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& sinks_shape,
const paddle::optional<std::vector<int64_t>>& block_tables_headwise_shape,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -937,6 +963,7 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
const paddle::optional<paddle::DataType>& sinks_dtype,
const paddle::optional<paddle::DataType>& block_tables_headwise_dtype,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -1024,6 +1051,7 @@ std::vector<std::vector<int64_t>> AppendAttentionWithOutputInferShape(
const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& sinks_shape,
const paddle::optional<std::vector<int64_t>>& block_tables_headwise_shape,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -1083,6 +1111,7 @@ std::vector<paddle::DataType> AppendAttentionWithOutputInferDtype(
const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
const paddle::optional<paddle::DataType>& sinks_dtype,
const paddle::optional<paddle::DataType>& block_tables_headwise_dtype,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -1140,7 +1169,8 @@ PD_BUILD_STATIC_OP(append_attention)
paddle::Optional("kv_signal_data"),
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight"),
paddle::Optional("sinks")})
paddle::Optional("sinks"),
paddle::Optional("block_tables_headwise")})
.Outputs({"fmha_out"})
.Attrs({
"rms_norm_eps: float",
Expand Down Expand Up @@ -1203,7 +1233,8 @@ PD_BUILD_STATIC_OP(append_attention_with_output)
paddle::Optional("kv_signal_data"),
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight"),
paddle::Optional("sinks")})
paddle::Optional("sinks"),
paddle::Optional("block_tables_headwise")})
.Outputs({"fmha_out_out"})
.SetInplaceMap({{"fmha_out", "fmha_out_out"}})
.Attrs({
Expand Down
8 changes: 8 additions & 0 deletions custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ void CascadeAppendAttentionC16Kernel(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -109,6 +110,7 @@ void CascadeAppendAttentionC16Kernel(
batch_id_per_token,
cu_seqlens_q,
block_table,
block_table_headwise,
batch_ids,
tile_ids_per_batch,
num_blocks,
Expand Down Expand Up @@ -156,6 +158,7 @@ CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -204,6 +207,7 @@ CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -251,6 +255,7 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -298,6 +303,7 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -346,6 +352,7 @@ CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4m3fn>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down Expand Up @@ -393,6 +400,7 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, int8_t>(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
Expand Down
2 changes: 2 additions & 0 deletions custom_ops/gpu_ops/append_attn/append_attention_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ void CascadeAppendAttentionKernel(
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::optional<paddle::Tensor>& block_table_headwise,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const std::string& cache_quant_type_str,
Expand Down Expand Up @@ -86,6 +87,7 @@ void CascadeAppendAttentionKernel(
batch_id_per_token,
cu_seqlens_q,
block_table,
block_table_headwise,
batch_ids,
tile_ids_per_batch,
num_blocks,
Expand Down
Loading
Loading