Merged
Commits
35 commits
0f8148b
[deps] update llm.kernel.cuda top internal
Alcanderian May 8, 2024
1492147
[feature] update db.mhca kernel interface
Alcanderian May 8, 2024
f8ac45b
[opt] optimize i8i8 fuse
Alcanderian May 9, 2024
2c5bd05
[feature] enable i4f16 online quantize
Alcanderian May 9, 2024
51aee56
[feature] add dep version variables
ouonline May 14, 2024
672e497
[fix] do not override `PPLNN_DEP_PROTOBUF_VERSION` so that we can use…
ouonline May 16, 2024
8096316
[deps] change ppl.llm.kernel.cuda to internal-master
Alcanderian May 22, 2024
db7fa63
[feature] add alibislope gen
Alcanderian May 16, 2024
7c939eb
[opt] update mhca interface
Alcanderian May 17, 2024
914d9c7
[feture] reserve old benchmark for compare
Alcanderian May 17, 2024
ae48659
[refactor] update mhca api
Alcanderian May 20, 2024
c64686c
[fix] don't call virtual function in destructor
ouonline May 27, 2024
ad83776
[feature] add decoding attn engine option
Alcanderian Jun 3, 2024
50e70ac
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
aa80986
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
ad27faa
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
0c5dc3c
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
a5d7faa
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
6ee0ca9
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
f9cb71d
[opt]add cudnn&VisionEmbedding micro.
Jun 5, 2024
69cbdf0
[fix] minor fix llm_cuda_device.h
Alcanderian Jun 21, 2024
de88f5e
[feature][cuda]add vision embedding to llm.
Apr 17, 2024
3ff4f81
[feature]add cudnn support in cmake/cmakelists.txt.
May 14, 2024
1bc2652
[opt]swith cudnn dependency to ppl.llm.kernel.cuda.
May 17, 2024
4451188
[feature] suport fp8
Sep 4, 2024
92e3a65
fix review comment
Sep 6, 2024
fa89d9f
[feature] support pixel unshuffle
yimmmin Jul 19, 2024
a27d806
[feature] support tensor parallel rms norm
yimmmin Aug 20, 2024
33d8c4c
[feature] support cache prefill attention
yimmmin Sep 4, 2024
90226be
[fix] fix cache_prefill_flag val
yimmmin Sep 20, 2024
cdc232c
[log] change cache prefill log to debug
Oct 8, 2024
07e9100
[release] llm_v2
Alcanderian Oct 28, 2024
5f53615
[fix] fix deps
Alcanderian Nov 5, 2024
d7ec72b
[fix] fix online_cast_kernel.h
Alcanderian Nov 7, 2024
482bfe7
[deps] update deps
Alcanderian Nov 7, 2024
10 changes: 5 additions & 5 deletions cmake/deps.cmake
@@ -63,7 +63,7 @@ if(PPLNN_DEP_HPCC_PKG)
SUBBUILD_DIR ${HPCC_DEPS_DIR}/hpcc-subbuild)
else()
if(NOT PPLNN_DEP_HPCC_GIT)
set(PPLNN_DEP_HPCC_GIT "https://github.com/openppl-public/hpcc.git")
set(PPLNN_DEP_HPCC_GIT "https://github.com/OpenPPL/hpcc.git")
endif()
FetchContent_Declare(hpcc
GIT_REPOSITORY ${PPLNN_DEP_HPCC_GIT}
@@ -113,7 +113,7 @@ if(PPLNN_DEP_PPLCOMMON_PKG)
${PPLNN_DEP_PPLCOMMON_PKG})
else()
if(NOT PPLNN_DEP_PPLCOMMON_GIT)
set(PPLNN_DEP_PPLCOMMON_GIT "https://github.com/openppl-public/ppl.common.git")
set(PPLNN_DEP_PPLCOMMON_GIT "https://github.com/OpenPPL/ppl.common.git")
endif()
hpcc_declare_git_dep(pplcommon
${PPLNN_DEP_PPLCOMMON_GIT}
@@ -257,7 +257,7 @@ if(PPLNN_USE_X86_64 OR PPLNN_USE_AARCH64 OR PPLNN_USE_ARMV7 OR PPLNN_USE_RISCV64
${PPLNN_DEP_PPLCPUKERNEL_PKG})
else()
if(NOT PPLNN_DEP_PPLCPUKERNEL_GIT)
set(PPLNN_DEP_PPLCPUKERNEL_GIT "https://github.com/openppl-public/ppl.kernel.cpu.git")
set(PPLNN_DEP_PPLCPUKERNEL_GIT "https://github.com/OpenPPL/ppl.kernel.cpu.git")
endif()
hpcc_declare_git_dep(ppl.kernel.cpu
${PPLNN_DEP_PPLCPUKERNEL_GIT}
@@ -277,7 +277,7 @@ if(PPLNN_USE_CUDA)
${PPLNN_DEP_PPLCUDAKERNEL_PKG})
else()
if(NOT PPLNN_DEP_PPLCUDAKERNEL_GIT)
set(PPLNN_DEP_PPLCUDAKERNEL_GIT "https://github.com/openppl-public/ppl.kernel.cuda.git")
set(PPLNN_DEP_PPLCUDAKERNEL_GIT "https://github.com/OpenPPL/ppl.kernel.cuda.git")
endif()
hpcc_declare_git_dep(ppl.kernel.cuda
${PPLNN_DEP_PPLCUDAKERNEL_GIT}
@@ -296,7 +296,7 @@ if(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_PKG)
${PPLNN_DEP_PPL_LLM_KERNEL_CUDA_PKG})
else()
if(NOT PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT)
set(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT "https://github.com/openppl-public/ppl.llm.kernel.cuda.git")
set(PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT "https://github.com/OpenPPL/ppl.llm.kernel.cuda.git")
endif()
hpcc_declare_git_dep(ppl.llm.kernel.cuda
${PPLNN_DEP_PPL_LLM_KERNEL_CUDA_GIT}
94 changes: 94 additions & 0 deletions include/ppl/nn/engines/llm_cuda/options.h
@@ -63,6 +63,95 @@ enum {
ENGINE_CONF_DEBUG_DATA_DIR = 3,

ENGINE_CONF_MAX,

ENGINE_CONF_INTERNAL_BEGIN = 1000,

/**
@brief uint32_t, set the shared memory decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm uses shared memory to store the softmax logits and is the fastest algorithm
for decode-phase attention, but the context length is limited by the size of shared memory.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_SHM_MHA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_SHM_MHA = 1000,

/**
@brief uint32_t, set the infinity decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm rescales the softmax logits in registers. It is a bit slower than shared memory decoding
attention, but the context length is unlimited.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_INF_MHA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_INF_MHA = 1001,

/**
@brief uint32_t, set the infinity grouped-query decoding attention algorithm to heuristic(1)/off(0), default is heuristic
This algorithm rescales the softmax logits in shared memory and is optimized with tensor cores for grouped-query attention.
It can be very fast when the decoding batch size is large (usually more than 64), and the context length is unlimited.

At least one of `ENGINE_CONF_DECODING_SHM_MHA`, `ENGINE_CONF_DECODING_INF_MHA` and `ENGINE_CONF_DECODING_INF_GQA` must be left enabled.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_INF_GQA, (uint32_t)1);
@endcode
*/
ENGINE_CONF_DECODING_INF_GQA = 1002,

/**
@brief uint32_t, set the split-k decoding attention algorithm to always-on(2)/heuristic(1)/off(0), default is heuristic
Applies split-k decoding to all decoding algorithms, accelerating long-context decoding.
Recommended for context lengths >= 1024, but may slow decoding down when the batch size is too large.
It is suggested to always turn it on for context lengths >= 16k.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_SPLIT_K, (uint32_t)2);
@endcode
*/
ENGINE_CONF_DECODING_ATTN_SPLIT_K = 1003,

/**
@brief uint32_t, specify the decoding attention kernel threads per block: 512, 256, or heuristic(0), default is heuristic
Pins the decoding attention kernels to 512 or 256 threads per block; 0 lets the engine choose heuristically.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_TPB, (uint32_t)0);
@endcode
*/
ENGINE_CONF_DECODING_ATTN_TPB = 1004,

/**
@brief uint32_t, set the prefill flash attention key and value tensors to use the int8 kv cache; the default is
unquantized fp16 tensors. It is used to support chunked prefill and prompt caching.

@note example:
@code{.cpp}
cuda_engine->Configure(ENGINE_CONF_CACHE_PREFILL, (uint32_t)1);
@endcode
*/
ENGINE_CONF_CACHE_PREFILL = 1005,

// TODO: ENGINE_CONF_CUSTOM_LOGGER = 1006,

ENGINE_CONF_INTERNAL_MAX,

};

/** @brief memory management policies */
@@ -87,6 +176,9 @@ enum {

/** online quantization, fp16 tensor and int4 weight */
QUANT_METHOD_ONLINE_I4F16,

/** online quantization, fp8 tensor and fp8 weight */
QUANT_METHOD_ONLINE_F8F8,
};

/** @brief cublas layout hint, currently for selecting matrix layout for int8 gemm */
@@ -125,6 +217,8 @@ enum {
DEV_CONF_MAX,
};



}}}} // namespace ppl::nn::llm::cuda

#endif
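
Taken together, the new options can be set right after the engine is created. The snippet below is a minimal usage sketch, not part of this PR: it assumes the usual ppl::nn::llm::cuda::EngineFactory::Create() entry point and header paths (not shown in this diff) and a hypothetical long-context (>= 16k) deployment; the values follow the option docs above (1 = heuristic/on, 0 = off, 2 = always-on for split-k).

#include "ppl/nn/engines/llm_cuda/engine_factory.h"
#include "ppl/nn/engines/llm_cuda/options.h"

using namespace ppl::nn::llm::cuda;

ppl::nn::Engine* CreateTunedEngine() {
    EngineOptions options;
    ppl::nn::Engine* cuda_engine = EngineFactory::Create(options); // assumed factory entry point, see note above
    if (!cuda_engine) {
        return nullptr;
    }
    // keep all three decoding algorithms on their heuristic default so the engine can pick per shape
    cuda_engine->Configure(ENGINE_CONF_DECODING_SHM_MHA, (uint32_t)1);
    cuda_engine->Configure(ENGINE_CONF_DECODING_INF_MHA, (uint32_t)1);
    cuda_engine->Configure(ENGINE_CONF_DECODING_INF_GQA, (uint32_t)1);
    // always-on split-k (2), following the >= 16k context recommendation above
    cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_SPLIT_K, (uint32_t)2);
    // 0 leaves threads-per-block to the heuristic; 256 or 512 would pin it
    cuda_engine->Configure(ENGINE_CONF_DECODING_ATTN_TPB, (uint32_t)0);
    // read key/value from the int8 kv cache during prefill (chunked prefill / prompt cache)
    cuda_engine->Configure(ENGINE_CONF_CACHE_PREFILL, (uint32_t)1);
    return cuda_engine;
}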
71 changes: 65 additions & 6 deletions src/ppl/nn/engines/llm_cuda/engine.cc
@@ -95,7 +95,7 @@ RetCode LlmCudaEngine::ConfSetTensorParellelNcclComm(LlmCudaEngine* engine, va_l
engine->tensor_parallel_nccl_param_.comm = nccl_comm;
NCCL_CHECK(ncclCommCount(nccl_comm, &engine->tensor_parallel_nccl_param_.size), "ncclCommCount");
NCCL_CHECK(ncclCommUserRank(nccl_comm, &engine->tensor_parallel_nccl_param_.rank), "ncclCommUserRank");
LOG(INFO) << "Engine Conf tp nccl comm world size: "
LOG(INFO) << "Engine Conf tp nccl comm world size: "
<< engine->tensor_parallel_nccl_param_.size;
return RC_SUCCESS;
#else
@@ -122,6 +122,53 @@ RetCode LlmCudaEngine::ConfDebugDataDir(LlmCudaEngine* engine, va_list args) {
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfCachePrefill(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_cache_prefill = va_arg(args, uint32_t) ? true : false;
// TODO: Change to Custom logger
LOG(DEBUG) << "Engine Conf cache prefill: " << engine->config_.enable_cache_prefill;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingShmMha(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_sharemem_mhca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding shared memory mhca: " << engine->config_.enable_decoding_sharemem_mhca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingInfMha(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_infinity_mhca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding infinity mhca: " << engine->config_.enable_decoding_infinity_mhca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingInfGqa(LlmCudaEngine* engine, va_list args) {
engine->config_.enable_decoding_infinity_gqca = va_arg(args, uint32_t) ? true : false;
LOG(INFO) << "Engine Conf decoding infinity gqca: " << engine->config_.enable_decoding_infinity_gqca;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingAttnSplitK(LlmCudaEngine* engine, va_list args) {
uint32_t split_k = va_arg(args, uint32_t);
if (split_k != 0 && split_k != 1 && split_k != 2) {
LOG(ERROR) << "ENGINE_CONF_DECODING_ATTN_SPLIT_K only accept 0/1/2 but get " << split_k;
return ppl::common::RC_INVALID_VALUE;
}
engine->config_.specify_decoding_attn_split_k = split_k;
LOG(INFO) << "Engine Conf decoding attention split k: " << engine->config_.specify_decoding_attn_split_k;
return RC_SUCCESS;
}

RetCode LlmCudaEngine::ConfDecodingAttnTpb(LlmCudaEngine* engine, va_list args) {
uint32_t tpb = va_arg(args, uint32_t);
if (tpb != 0 && tpb != 256 && tpb != 512) {
LOG(ERROR) << "ENGINE_CONF_DECODING_ATTN_TPB only accept 0/256/512 but get " << tpb;
return ppl::common::RC_INVALID_VALUE;
}
engine->config_.specify_decoding_attn_tpb = tpb;
LOG(INFO) << "Engine Conf decoding attention tpb: " << engine->config_.specify_decoding_attn_tpb;
return RC_SUCCESS;
}

#ifdef PPLNN_ENABLE_PMX_MODEL
RetCode LlmCudaEngine::LoadConstants(const ConstantVisitor& visitor, map<edgeid_t, BufferInfo>* eid2info) {
return utils::LoadConstants(visitor, device_.get(), eid2info);
@@ -156,13 +203,13 @@ ppl::common::RetCode LlmCudaEngine::SerializeData(const pmx::SerializationContex
ppl::common::RetCode LlmCudaEngine::DeserializeData(const void* base, uint64_t size) {
auto fb_engine_param = GetEngineParam(base);
auto fb_param = fb_engine_param->value_as_EngineOptionsParam();

uint32_t cublas_layout_hint = fb_param->cublas_layout_hint();
if (cublas_layout_hint != options_.cublas_layout_hint) {
LOG(WARNING) << "deserialize cublas_layout_hint[" << cublas_layout_hint << "] diff from user input[" << options_.cublas_layout_hint << "]";
}
options_.cublas_layout_hint = cublas_layout_hint;

if (fb_param->version() != GetVersion()) {
LOG(WARNING) << "engine version[" << GetVersion() << "] diff from pmx version[" << fb_param->version() << "]";
}
@@ -176,16 +223,28 @@ LlmCudaEngine::ConfHandlerFunc LlmCudaEngine::conf_handlers_[] = {
ConfGraphFusion,
ConfTenosrDebug,
ConfDebugDataDir,

ConfDecodingShmMha,
ConfDecodingInfMha,
ConfDecodingInfGqa,
ConfDecodingAttnSplitK,
ConfDecodingAttnTpb,

ConfCachePrefill,
};

RetCode LlmCudaEngine::Configure(uint32_t option, ...) {
if (option >= ENGINE_CONF_MAX) {
LOG(ERROR) << "invalid option[" << option << "] >= [" << (uint32_t)ENGINE_CONF_MAX << "]";
auto conf_length = sizeof(conf_handlers_) / sizeof(ConfHandlerFunc);
auto uniform_option = option >= ENGINE_CONF_INTERNAL_BEGIN ?
option + ENGINE_CONF_MAX - ENGINE_CONF_INTERNAL_BEGIN :
option;
if (uniform_option >= conf_length) {
LOG(ERROR) << "invalid option[" << option << "]";
return RC_INVALID_VALUE;
}
va_list args;
va_start(args, option);
auto status = conf_handlers_[option](this, args);
auto status = conf_handlers_[uniform_option](this, args);
va_end(args);

return status;
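
The remapping in Configure packs the internal option range (starting at ENGINE_CONF_INTERNAL_BEGIN = 1000) directly behind the public options so a single flat handler table can be indexed. A standalone sketch of that arithmetic, not part of this PR, using the enum values visible in options.h in this diff:

#include <cassert>
#include <cstdint>

// values visible in options.h above
constexpr uint32_t kEngineConfMax = 4;     // ENGINE_CONF_MAX: public options occupy [0, 4)
constexpr uint32_t kInternalBegin = 1000;  // ENGINE_CONF_INTERNAL_BEGIN
constexpr uint32_t kInternalMax = 1006;    // ENGINE_CONF_INTERNAL_MAX
constexpr uint32_t kTableSize = kEngineConfMax + (kInternalMax - kInternalBegin); // 10 handlers in total

// same remapping as LlmCudaEngine::Configure: internal options are packed right after the public ones
constexpr uint32_t ToHandlerIndex(uint32_t option) {
    return option >= kInternalBegin ? option + kEngineConfMax - kInternalBegin : option;
}

int main() {
    assert(ToHandlerIndex(3) == 3);             // ENGINE_CONF_DEBUG_DATA_DIR keeps its slot
    assert(ToHandlerIndex(1000) == 4);          // ENGINE_CONF_DECODING_SHM_MHA follows the public block
    assert(ToHandlerIndex(1005) == 9);          // ENGINE_CONF_CACHE_PREFILL lands on the last handler
    assert(ToHandlerIndex(1006) >= kTableSize); // ENGINE_CONF_INTERNAL_MAX itself is rejected as out of range
    return 0;
}

The same arithmetic explains the new size of conf_handlers_ in engine.h below: ENGINE_CONF_MAX public slots plus (ENGINE_CONF_INTERNAL_MAX - ENGINE_CONF_INTERNAL_BEGIN) internal ones.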
11 changes: 10 additions & 1 deletion src/ppl/nn/engines/llm_cuda/engine.h
@@ -54,8 +54,17 @@ class LlmCudaEngine final : public EngineImpl {
static ppl::common::RetCode ConfTenosrDebug(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDebugDataDir(LlmCudaEngine*, va_list);

static ppl::common::RetCode ConfDecodingShmMha(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingInfMha(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingInfGqa(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingAttnSplitK(LlmCudaEngine*, va_list);
static ppl::common::RetCode ConfDecodingAttnTpb(LlmCudaEngine*, va_list);

static ppl::common::RetCode ConfCachePrefill(LlmCudaEngine*, va_list);

typedef ppl::common::RetCode (*ConfHandlerFunc)(LlmCudaEngine*, va_list);
static ConfHandlerFunc conf_handlers_[ENGINE_CONF_MAX];
static ConfHandlerFunc conf_handlers_[
ENGINE_CONF_MAX + (ENGINE_CONF_INTERNAL_MAX - ENGINE_CONF_INTERNAL_BEGIN)];

private:
EngineOptions options_;
7 changes: 7 additions & 0 deletions src/ppl/nn/engines/llm_cuda/engine_config.h
@@ -26,6 +26,13 @@ struct EngineConfig final {
bool enable_graph_fusion = true;
bool enable_tensor_debug = false;
std::string debug_data_dir = ".";

bool enable_cache_prefill = false;
bool enable_decoding_sharemem_mhca = true;
bool enable_decoding_infinity_mhca = true;
bool enable_decoding_infinity_gqca = true;
int32_t specify_decoding_attn_split_k = 1;
int32_t specify_decoding_attn_tpb = 0;
};

}}}} // namespace ppl::nn::llm::cuda