Merged
Commits
35 commits
0f8148b
[deps] update llm.kernel.cuda top internal
Alcanderian May 8, 2024
1492147
[feature] update db.mhca kernel interface
Alcanderian May 8, 2024
f8ac45b
[opt] optimize i8i8 fuse
Alcanderian May 9, 2024
2c5bd05
[feature] enable i4f16 online quantize
Alcanderian May 9, 2024
51aee56
[feature] add dep version variables
ouonline May 14, 2024
672e497
[fix] do not override `PPLNN_DEP_PROTOBUF_VERSION` so that we can use…
ouonline May 16, 2024
8096316
[deps] change ppl.llm.kernel.cuda to internal-master
Alcanderian May 22, 2024
db7fa63
[feature] add alibislope gen
Alcanderian May 16, 2024
7c939eb
[opt] update mhca interface
Alcanderian May 17, 2024
914d9c7
[feture] reserve old benchmark for compare
Alcanderian May 17, 2024
ae48659
[refactor] update mhca api
Alcanderian May 20, 2024
c64686c
[fix] don't call virtual function in destructor
ouonline May 27, 2024
ad83776
[feature] add decoding attn engine option
Alcanderian Jun 3, 2024
50e70ac
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
aa80986
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
ad27faa
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
0c5dc3c
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
a5d7faa
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
6ee0ca9
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
f9cb71d
[opt]add cudnn&VisionEmbedding micro.
Jun 5, 2024
69cbdf0
[fix] minor fix llm_cuda_device.h
Alcanderian Jun 21, 2024
de88f5e
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
3ff4f81
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
1bc2652
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
4451188
[feature] suport fp8
Sep 4, 2024
92e3a65
fix review comment
Sep 6, 2024
fa89d9f
[feature] support pixel unshuffle
yimmmin Jul 19, 2024
a27d806
[feature] support tensor parallel rms norm
yimmmin Aug 20, 2024
33d8c4c
[feature] support cache prefill attention
yimmmin Sep 4, 2024
90226be
[fix] fix cache_prefill_flag val
yimmmin Sep 20, 2024
cdc232c
[log] change cache prefill log to debug
Oct 8, 2024
07e9100
[release] llm_v2
Alcanderian Oct 28, 2024
5f53615
[fix] fix deps
Alcanderian Nov 5, 2024
d7ec72b
[fix] fix online_cast_kernel.h
Alcanderian Nov 7, 2024
482bfe7
[deps] update deps
Alcanderian Nov 7, 2024
10 changes: 5 additions & 5 deletions cmake/deps.cmake
@@ -63,7 +63,7 @@ if(PPLNN_DEP_HPCC_PKG)
SUBBUILD_DIR ${HPCC_DEPS_DIR}/hpcc-subbuild)
else()
if(NOT PPLNN_DEP_HPCC_GIT)
set(PPLNN_DEP_HPCC_GIT "https://github.com/openppl-public/hpcc.git")
set(PPLNN_DEP_HPCC_GIT "https://github.com/OpenPPL/hpcc.git")
endif()
FetchContent_Declare(hpcc
GIT_REPOSITORY ${PPLNN_DEP_HPCC_GIT}
@@ -113,7 +113,7 @@ if(PPLNN_DEP_PPLCOMMON_PKG)
${PPLNN_DEP_PPLCOMMON_PKG})
else()
if(NOT PPLNN_DEP_PPLCOMMON_GIT)
set(PPLNN_DEP_PPLCOMMON_GIT "https://github.com/openppl-public/ppl.common.git")
set(PPLNN_DEP_PPLCOMMON_GIT "https://github.com/OpenPPL/ppl.common.git")
endif()
hpcc_declare_git_dep(pplcommon
${PPLNN_DEP_PPLCOMMON_GIT}
@@ -257,7 +257,7 @@ if(PPLNN_USE_X86_64 OR PPLNN_USE_AARCH64 OR PPLNN_USE_ARMV7 OR PPLNN_USE_RISCV64
${PPLNN_DEP_PPLCPUKERNEL_PKG})
else()
if(NOT PPLNN_DEP_PPLCPUKERNEL_GIT)
set(PPLNN_DEP_PPLCPUKERNEL_GIT "https://github.com/openppl-public/ppl.kernel.cpu.git")
set(PPLNN_DEP_PPLCPUKERNEL_GIT "https://github.com/OpenPPL/ppl.kernel.cpu.git")
endif()
hpcc_declare_git_dep(ppl.kernel.cpu
${PPLNN_DEP_PPLCPUKERNEL_GIT}
@@ -277,7 +277,7 @@ if(PPLNN_USE_CUDA)
${PPLNN_DEP_PPLCUDAKERNEL_PKG})
else()
if(NOT PPLNN_DEP_PPLCUDAKERNEL_GIT)
set(PPLNN_DEP_PPLCUDAKERNEL_GIT "https://github.com/openppl-public/ppl.kernel.cuda.git")
set(PPLNN_DEP_PPLCUDAKERNEL_GIT "https://github.com/OpenPPL/ppl.kernel.cuda.git")
endif()
hpcc_declare_git_dep(ppl.kernel.cuda
${PPLNN_DEP_PPLCUDAKERNEL_GIT}
@@ -296,7 +296,7 @@ if(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_PKG)
${PPLNN_DEP_PPL_LLM_KERNEL_CUDA_PKG})
else()
if(NOT PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT)
set(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT "https://github.com/openppl-public/ppl.llm.kernel.cuda.git")
set(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT "https://github.com/OpenPPL/ppl.llm.kernel.cuda.git")
endif()
hpcc_declare_git_dep(ppl.llm.kernel.cuda
${PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT}
94 changes: 94 additions & 0 deletions include/ppl/nn/engines/llm_cuda/options.h
@@ -63,6 +63,95 @@ enum {
ENGINE_CONF_DEBUG_DATA_DIR = 3,

ENGINE_CONF_MAX,

ENGINE_CONF_INTERNAL_BEGIN = 1000,

/**
@brief uint32_t, set the shared memory decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm uses shared memory to store the softmax logits and is the fastest algorithm
for decode-phase attention, but the context length is limited by the size of shared memory.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_SHM_MHA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_SHM_MHA = 1000,

/**
@brief uint32_t, set the infinity decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm rescales the softmax logits in registers. It is a bit slower than shared memory decoding
attention, but the context length is unlimited.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_INF_MHA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_INF_MHA = 1001,

/**
@brief uint32_t, set the infinity grouped-query decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm rescales the softmax logits in shared memory and is optimized with tensor cores for grouped-query attention.
It can be very fast when the decoding batch size is large (usually more than 64), and the context length is unlimited.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_INF_GQA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_INF_GQA = 1002,

/**
@brief uint32_t, set the split-k decoding attention algorithm to always-on(2)/heuristic(1)/off(0), default is heuristic
Applies split-k decoding to all decoding algorithms, accelerating long-context decoding.
Recommended for context lengths >= 1024, but may slow decoding down when the batch size is too large.
It is suggested to always turn it on for context lengths >= 16k.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_SPLIT_K, (uint32_t)2);
@endcode
*/
ENGINE_CONF_DECODING_ATTN_SPLIT_K = 1003,

/**
@brief uint32_t, specify the decoding attention kernel threads per block: 512, 256, or heuristic(0), default is heuristic
Pins the decoding attention kernels to 512 or 256 threads per block; 0 lets the engine choose heuristically.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_TPB, (uint32_t)0);
@endcode
*/
ENGINE_CONF_DECODING_ATTN_TPB = 1004,

/**
@brief uint32_t, set the prefill flash attention key and value tensors to use the int8 kv cache; the default is
unquantized fp16 tensors. It is used to support chunked prefill and prompt caching.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_CACHE_PREFILL, (uint32_t)1);
@endcode
*/
ENGINE_CONF_CACHE_PREFILL = 1005,

// TODO: ENGINE_CONF_CUSTOM_LOGGER = 1006,

ENGINE_CONF_INTERNAL_MAX,

};

/** @brief memory management policies */
@@ -87,6 +176,9 @@ enum {

/** online quantization, fp16 tensor and int4 weight */
QUANT_METHOD_ONLINE_I4F16,

/** online quantization, fp8 tensor and fp8 weight */
QUANT_METHOD_ONLINE_F8F8,
};

/** @brief cublas layout hint, currently for selecting matrix layout for int8 gemm */
@@ -125,6 +217,8 @@ enum {
DEV_CONF_MAX,
};



}}}} // namespace ppl::nn::llm::cuda

#endif
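
Taken together, the new options can be set right after the engine is created. The snippet below is a minimal usage sketch, not part of this PR: it assumes the usual ppl::nn::llm::cuda::EngineFactory::Create() entry point and header paths (not shown in this diff) and a hypothetical long-context (>= 16k) deployment; the values follow the option docs above (1 = heuristic/on, 0 = off, 2 = always-on for split-k).

#include "ppl/nn/engines/llm_cuda/engine_factory.h"
#include "ppl/nn/engines/llm_cuda/options.h"

using namespace ppl::nn::llm::cuda;

ppl::nn::Engine* CreateTunedEngine() {
    EngineOptions options;
    ppl::nn::Engine* cuda_engine = EngineFactory::Create(options); // assumed factory entry point, see note above
    if (!cuda_engine) {
        return nullptr;
    }
    // keep all three decoding algorithms on their heuristic default so the engine can pick per shape
    cuda_engine->Configure(ENGINE_CONF_DECODING_SHM_MHA, (uint32_t)1);
    cuda_engine->Configure(ENGINE_CONF_DECODING_INF_MHA, (uint32_t)1);
    cuda_engine->Configure(ENGINE_CONF_DECODING_INF_GQA, (uint32_t)1);
    // always-on split-k (2), following the >= 16k context recommendation above
    cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_SPLIT_K, (uint32_t)2);
    // 0 leaves threads-per-block to the heuristic; 256 or 512 would pin it
    cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_TPB, (uint32_t)0);
    // read key/value from the int8 kv cache during prefill (chunked prefill / prompt cache)
    cuda_engine->Configure(ENGINE_CONF_CACHE_PREFILL, (uint32_t)1);
    return cuda_engine;
}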
71 changes: 65 additions & 6 deletions src/ppl/nn/engines/llm_cuda/engine.cc
@@ -95,7 +95,7 @@ RetCode LlmCudaEngine::ConfSetTensorParellelNcclComm(LlmCudaEngine* engine, va_l
engine->tensor_parallel_nccl_param_.comm = nccl_comm;
NCCL_CHECK(ncclCommCount(nccl_comm, &engine->tensor_parallel_nccl_param_.size), "ncclCommCount");
NCCL_CHECK(ncclCommUserRank(nccl_comm, &engine->tensor_parallel_nccl_param_.rank), "ncclCommUserRank");
LOG(INFO) << "Engine Conf tp nccl comm world size: "
LOG(INFO) << "Engine Conf tp nccl comm world size: "
<< engine->tensor_parallel_nccl_param_.size;
return RC_SUCCESS;
#else
@@ -122,6 +122,53 @@ RetCode LlmCudaEngine::ConfDebugDataDir(LlmCudaEngine* engine, va_list args) {
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfCachePrefill(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_cache_prefill = va_arg(args, uint32_t) ? true : false;
// TODO: Change to Custom logger
LOG(DEBUG) << "Engine Conf cache prefill: " << engine->config_.enable_cache_prefill;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingShmMha(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_sharemem_mhca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding shared memory mhca: " << engine->config_.enable_decoding_sharemem_mhca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingInfMha(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_infinity_mhca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding infinity mhca: " << engine->config_.enable_decoding_infinity_mhca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingInfGqa(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_infinity_gqca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding infinity gqca: " << engine->config_.enable_decoding_infinity_gqca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingAttnSplitK(LlmCudaEngine* engine, va_list args) {
uint32_t split_k = va_arg(args, uint32_t);
if (split_k != 0 && split_k != 1 && split_k != 2) {
LOG(ERROR) << "ENGINE_CONF_DECODING_ATTN_SPLIT_K only accept 0/1/2 but get " << split_k;
return ppl::common::RC_INVALID_VALUE;
}
engine->config_.specify_decoding_attn_split_k = split_k;
LOG(INFO) << "Engine Conf decoding attention split k: " << engine->config_.specify_decoding_attn_split_k;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingAttnTpb(LlmCudaEngine* engine, va_list args) {
uint32_t tpb = va_arg(args, uint32_t);
if (tpb != 0 && tpb != 256 && tpb != 512) {
LOG(ERROR) << "ENGINE_CONF_DECODING_ATTN_TPB only accept 0/256/512 but get " << tpb;
return ppl::common::RC_INVALID_VALUE;
}
engine->config_.specify_decoding_attn_tpb = tpb;
LOG(INFO) << "Engine Conf decoding attention tpb: " << engine->config_.specify_decoding_attn_tpb;
return RC_SUCCESS;
}

#ifdef PPLNN_ENABLE_PMX_MODEL
RetCode LlmCudaEngine::LoadConstants(const ConstantVisitor& visitor, map<edgeid_t, BufferInfo>* eid2info) {
return utils::LoadConstants(visitor, device_.get(), eid2info);
@@ -156,13 +203,13 @@ ppl::common::RetCode LlmCudaEngine::SerializeData(const pmx::SerializationContex
ppl::common::RetCode LlmCudaEngine::DeserializeData(const void* base, uint64_t size) {
auto fb_engine_param = GetEngineParam(base);
auto fb_param = fb_engine_param->value_as_EngineOptionsParam();

uint32_t cublas_layout_hint = fb_param->cublas_layout_hint();
if (cublas_layout_hint != options_.cublas_layout_hint) {
LOG(WARNING) << "deserialize cublas_layout_hint[" << cublas_layout_hint << "] diff from user input[" << options_.cublas_layout_hint << "]";
}
options_.cublas_layout_hint = cublas_layout_hint;

if (fb_param->version() != GetVersion()) {
LOG(WARNING) << "engine version[" << GetVersion() << "] diff from pmx version[" << fb_param->version() << "]";
}
@@ -176,16 +223,28 @@ LlmCudaEngine::ConfHandlerFunc LlmCudaEngine::conf_handlers_[] = {
ConfGraphFusion,
ConfTenosrDebug,
ConfDebugDataDir,

ConfDecodingShmMha,
ConfDecodingInfMha,
ConfDecodingInfGqa,
ConfDecodingAttnSplitK,
ConfDecodingAttnTpb,

ConfCachePrefill,
};

RetCode LlmCudaEngine::Configure(uint32_t option, ...) {
if (option >= ENGINE_CONF_MAX) {
LOG(ERROR) << "invalid option[" << option << "] >= [" << (uint32_t)ENGINE_CONF_MAX << "]";
auto conf_length = sizeof(conf_handlers_) / sizeof(ConfHandlerFunc);
auto uniform_option = option >= ENGINE_CONF_INTERNAL_BEGIN ?
option + ENGINE_CONF_MAX - ENGINE_CONF_INTERNAL_BEGIN :
option;
if (uniform_option >= conf_length) {
LOG(ERROR) << "invalid option[" << option << "]";
return RC_INVALID_VALUE;
}
va_list args;
va_start(args, option);
auto status = conf_handlers_[option](this, args);
auto status = conf_handlers_[uniform_option](this, args);
va_end(args);

return status;
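
The remapping in Configure packs the internal option range (starting at ENGINE_CONF_INTERNAL_BEGIN = 1000) directly behind the public options so a single flat handler table can be indexed. A standalone sketch of that arithmetic, not part of this PR, using the enum values visible in options.h in this diff:

#include <cassert>
#include <cstdint>

// values visible in options.h above
constexpr uint32_t kEngineConfMax = 4;     // ENGINE_CONF_MAX: public options occupy [0, 4)
constexpr uint32_t kInternalBegin = 1000;  // ENGINE_CONF_INTERNAL_BEGIN
constexpr uint32_t kInternalMax = 1006;    // ENGINE_CONF_INTERNAL_MAX
constexpr uint32_t kTableSize = kEngineConfMax + (kInternalMax - kInternalBegin); // 10 handlers in total

// same remapping as LlmCudaEngine::Configure: internal options are packed right after the public ones
constexpr uint32_t ToHandlerIndex(uint32_t option) {
    return option >= kInternalBegin ? option + kEngineConfMax - kInternalBegin : option;
}

int main() {
    assert(ToHandlerIndex(3) == 3);             // ENGINE_CONF_DEBUG_DATA_DIR keeps its slot
    assert(ToHandlerIndex(1000) == 4);          // ENGINE_CONF_DECODING_SHM_MHA follows the public block
    assert(ToHandlerIndex(1005) == 9);          // ENGINE_CONF_CACHE_PREFILL lands on the last handler
    assert(ToHandlerIndex(1006) >= kTableSize); // ENGINE_CONF_INTERNAL_MAX itself is rejected as out of range
    return 0;
}

The same arithmetic explains the new size of conf_handlers_ in engine.h below: ENGINE_CONF_MAX public slots plus (ENGINE_CONF_INTERNAL_MAX - ENGINE_CONF_INTERNAL_BEGIN) internal ones.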
11 changes: 10 additions & 1 deletion src/ppl/nn/engines/llm_cuda/engine.h
@@ -54,8 +54,17 @@ class LlmCudaEngine final : public EngineImpl {
static ppl::common::RetCode ConfTenosrDebug(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDebugDataDir(LlmCudaEngine*, va_list);

static ppl::common::RetCode ConfDecodingShmMha(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingInfMha(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingInfGqa(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingAttnSplitK(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingAttnTpb(LlmCudaEngine*, va_list);

static ppl::common::RetCode ConfCachePrefill(LlmCudaEngine*, va_list);

typedef ppl::common::RetCode (*ConfHandlerFunc)(LlmCudaEngine*, va_list);
static ConfHandlerFunc conf_handlers_[ENGINE_CONF_MAX];
static ConfHandlerFunc conf_handlers_[
ENGINE_CONF_MAX + (ENGINE_CONF_INTERNAL_MAX - ENGINE_CONF_INTERNAL_BEGIN)];

private:
EngineOptions options_;
7 changes: 7 additions & 0 deletions src/ppl/nn/engines/llm_cuda/engine_config.h
@@ -26,6 +26,13 @@ struct EngineConfig final {
bool enable_graph_fusion = true;
bool enable_tensor_debug = false;
std::string debug_data_dir = ".";

bool enable_cache_prefill = false;
bool enable_decoding_sharemem_mhca = true;
bool enable_decoding_infinity_mhca = true;
bool enable_decoding_infinity_gqca = true;
int32_t specify_decoding_attn_split_k = 1;
int32_t specify_decoding_attn_tpb = 0;
};

}}}} // namespace ppl::nn::llm::cuda