diff --git a/custom_ops/gpu_ops/beam_search_softmax.cu b/custom_ops/gpu_ops/beam_search_softmax.cu index 59500dbe33f..d7ee12d5c84 100644 --- a/custom_ops/gpu_ops/beam_search_softmax.cu +++ b/custom_ops/gpu_ops/beam_search_softmax.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #ifdef __NVCC__ #include #endif @@ -19,14 +20,16 @@ #include namespace cub = hipcub; #endif -#include #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include #include "helper.h" #include "stdint.h" diff --git a/custom_ops/gpu_ops/custom_ftok.h b/custom_ops/gpu_ops/custom_ftok.h index 302061baf63..d92733db912 100644 --- a/custom_ops/gpu_ops/custom_ftok.h +++ b/custom_ops/gpu_ops/custom_ftok.h @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once +#ifndef _WIN32 #include #include #include @@ -35,3 +37,4 @@ inline key_t custom_ftok(const char* path, int id) { return static_cast(((st.st_dev & 0x0f) << 28) | ((st.st_ino & 0xff) << 20) | (id & 0xfffff)); } +#endif diff --git a/custom_ops/gpu_ops/dequant_int8.cu b/custom_ops/gpu_ops/dequant_int8.cu index 2b5f1b92d97..ffad6c8df8a 100644 --- a/custom_ops/gpu_ops/dequant_int8.cu +++ b/custom_ops/gpu_ops/dequant_int8.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +// Hackathon 10th Spring No.46 — compilation guards #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include #include "helper.h" diff --git a/custom_ops/gpu_ops/env.h b/custom_ops/gpu_ops/env.h index eed65bea2c9..8240e37dfd5 100644 --- a/custom_ops/gpu_ops/env.h +++ b/custom_ops/gpu_ops/env.h @@ -12,8 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once +#include +#include + inline uint32_t get_decoder_block_shape_q() { static const char* decoder_block_shape_q_env = std::getenv("FLAGS_dec_block_shape_q"); diff --git a/custom_ops/gpu_ops/fused_get_rotary_embedding.cu b/custom_ops/gpu_ops/fused_get_rotary_embedding.cu index 6d5f50f832a..0d6b1d6d6a0 100644 --- a/custom_ops/gpu_ops/fused_get_rotary_embedding.cu +++ b/custom_ops/gpu_ops/fused_get_rotary_embedding.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +// Hackathon 10th Spring No.46 — compilation guards #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include #include "paddle/extension.h" diff --git a/custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu b/custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu index 0e78a040613..02013df9c2b 100644 --- a/custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu +++ b/custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +// Hackathon 10th Spring No.46 — compilation guards #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include #include "helper.h" diff --git a/custom_ops/gpu_ops/get_data_ptr_ipc.cu b/custom_ops/gpu_ops/get_data_ptr_ipc.cu index b866785429e..6d6fb9c4d26 100644 --- a/custom_ops/gpu_ops/get_data_ptr_ipc.cu +++ b/custom_ops/gpu_ops/get_data_ptr_ipc.cu @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include "cuda_multiprocess.h" #include "helper.h" namespace { +#ifndef _WIN32 int sharedMemoryOpen2(const char *name, size_t sz, sharedMemoryInfo *info) { info->size = sz; info->shmFd = shm_open(name, O_RDWR, 0777); @@ -31,10 +33,16 @@ int sharedMemoryOpen2(const char *name, size_t sz, sharedMemoryInfo *info) { return 0; } +#endif } // namespace std::vector GetDataPtrIpc(const paddle::Tensor &tmp_input, const std::string &shm_name) { +#ifdef _WIN32 + PD_THROW( + "GetDataPtrIpc is not supported on Windows " + "(POSIX shared memory required)."); +#else auto out_data_ptr_tensor = paddle::full({1}, 0, paddle::DataType::INT64, paddle::CPUPlace()); auto out_data_ptr_tensor_ptr = out_data_ptr_tensor.data(); @@ -53,6 +61,7 @@ std::vector GetDataPtrIpc(const paddle::Tensor &tmp_input, out_data_ptr_tensor_ptr[0] = reinterpret_cast(ptr); return {out_data_ptr_tensor}; +#endif } PD_BUILD_STATIC_OP(get_data_ptr_ipc) diff --git a/custom_ops/gpu_ops/get_output.cc b/custom_ops/gpu_ops/get_output.cc index 4714315d17c..773955b2bd2 100644 --- a/custom_ops/gpu_ops/get_output.cc +++ b/custom_ops/gpu_ops/get_output.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "custom_ftok.h" #include "paddle/extension.h" @@ -36,6 +39,9 @@ void GetOutput(const paddle::Tensor& x, int64_t rank_id, bool wait_flag, int msg_queue_id) { +#ifdef _WIN32 + PD_THROW("GetOutput is not supported on Windows (POSIX IPC required)."); +#else if (rank_id > 0) { return; } @@ -81,6 +87,7 @@ void GetOutput(const paddle::Tensor& x, #endif return; +#endif } void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) { diff --git a/custom_ops/gpu_ops/get_output_ep.cc b/custom_ops/gpu_ops/get_output_ep.cc index 2b5f7859976..7941dc4b033 100644 --- a/custom_ops/gpu_ops/get_output_ep.cc +++ b/custom_ops/gpu_ops/get_output_ep.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "custom_ftok.h" #include "msg_utils.h" #include "paddle/extension.h" @@ -29,6 +32,10 @@ void GetOutputKVSignal(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) { +#ifdef _WIN32 + PD_THROW( + "GetOutputKVSignal is not supported on Windows (POSIX IPC required)."); +#else int msg_queue_id = 1024; if (const char* msg_que_str_tmp = std::getenv("INFERENCE_MSG_QUEUE_ID")) { std::string msg_que_str(msg_que_str_tmp); @@ -57,12 +64,16 @@ void GetOutputKVSignal(const paddle::Tensor& x, out_data[i] = msg_rcv.mtext[i]; } return; +#endif } void GetOutputEp(const paddle::Tensor& x, int64_t rank_id, bool wait_flag, int msg_queue_id) { +#ifdef _WIN32 + PD_THROW("GetOutputEp is not supported on Windows (POSIX IPC required)."); +#else static struct msgdata msg_rcv; if (const char* inference_msg_queue_id_env_p = std::getenv("INFERENCE_MSG_QUEUE_ID")) { @@ -108,6 +119,7 @@ void GetOutputEp(const paddle::Tensor& x, #endif return; +#endif } void GetOutputEPStatic(const paddle::Tensor& x, diff --git a/custom_ops/gpu_ops/get_output_msg_with_topk.cc b/custom_ops/gpu_ops/get_output_msg_with_topk.cc index e70f7c2c24d..a5bc745dfc8 100644 --- a/custom_ops/gpu_ops/get_output_msg_with_topk.cc +++ b/custom_ops/gpu_ops/get_output_msg_with_topk.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "custom_ftok.h" #include "paddle/extension.h" @@ -40,6 +43,9 @@ void GetOutputTopK(const paddle::Tensor& x, int k, int64_t rank_id, bool wait_flag) { +#ifdef _WIN32 + PD_THROW("GetOutputTopK is not supported on Windows (POSIX IPC required)."); +#else static struct msgdata msg_rcv; int msg_queue_id = 1; @@ -101,6 +107,7 @@ void GetOutputTopK(const paddle::Tensor& x, ranks_data[i] = (int64_t)msg_rcv.mtext_ranks[i]; } return; +#endif } PD_BUILD_STATIC_OP(get_output_topk) diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h index 83f3ad1077d..b0a044285af 100644 --- a/custom_ops/gpu_ops/helper.h +++ b/custom_ops/gpu_ops/helper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once #include @@ -19,15 +20,17 @@ #ifndef PADDLE_WITH_COREX #include "glog/logging.h" #endif -#include #include #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include #include #include diff --git a/custom_ops/gpu_ops/msg_utils.h b/custom_ops/gpu_ops/msg_utils.h index 9976ee1be4a..892cace8580 100644 --- a/custom_ops/gpu_ops/msg_utils.h +++ b/custom_ops/gpu_ops/msg_utils.h @@ -12,18 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once -#include #include #include #include -#include #include #include +#ifndef _WIN32 +#include #include +#include #include #include +#endif #include "paddle/extension.h" #define MAX_BSZ 512 diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc index 2c34d49b747..2193f3c523d 100644 --- a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc +++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include "remote_cache_kv_ipc.h" RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data @@ -24,6 +25,11 @@ bool RemoteCacheKvIpc::kv_complete_signal_shmem_opened = false; RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data RemoteCacheKvIpc::open_shm_and_get_complete_signal_meta_data( const int rank_id, const int device_id, const bool keep_pd_step_flag) { +#ifdef _WIN32 + PD_THROW( + "open_shm_and_get_complete_signal_meta_data is not supported on " + "Windows (POSIX shared memory required)."); +#else if (RemoteCacheKvIpc::kv_complete_signal_shmem_opened) { if (keep_pd_step_flag) { return RemoteCacheKvIpc::kv_complete_signal_meta_data; @@ -103,6 +109,7 @@ RemoteCacheKvIpc::open_shm_and_get_complete_signal_meta_data( RemoteCacheKvIpc::kv_complete_signal_identity_ptr = identity_ptr; RemoteCacheKvIpc::kv_complete_signal_shmem_opened = true; return meta_data; +#endif } void CUDART_CB diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.h b/custom_ops/gpu_ops/remote_cache_kv_ipc.h index 542957c45c2..b62f5ac1524 100644 --- a/custom_ops/gpu_ops/remote_cache_kv_ipc.h +++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.h @@ -12,18 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once -#include #include #include #include +#include +#include +#ifndef _WIN32 +#include #include #include #include -#include -#include #include +#endif #include "custom_ftok.h" #include "driver_types.h" @@ -58,6 +61,11 @@ struct RemoteCacheKvIpc { const int rank, const int num_layers, const int real_bsz) { +#ifdef _WIN32 + PD_THROW( + "RemoteCacheKvIpc::init is not supported on Windows " + "(POSIX IPC required)."); +#else layer_id_ = 0; num_layers_ = num_layers; msg_sed.mtype = 1; @@ -85,9 +93,15 @@ struct RemoteCacheKvIpc { msgid = msgget(key, IPC_CREAT | 0666); inited = true; } +#endif } void CUDART_CB send_signal() { +#ifdef _WIN32 + PD_THROW( + "RemoteCacheKvIpc::send_signal is not supported on Windows " + "(POSIX IPC required)."); +#else if (inited) { msg_sed.mtext[1] = layer_id_; if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) { @@ -96,6 +110,7 @@ struct RemoteCacheKvIpc { layer_id_ = (layer_id_ + 1); assert(layer_id_ <= num_layers_); } +#endif } }; diff --git a/custom_ops/gpu_ops/save_output_msg_with_topk.cc b/custom_ops/gpu_ops/save_output_msg_with_topk.cc index 0a7d2ab6eac..189c0dd6934 100644 --- a/custom_ops/gpu_ops/save_output_msg_with_topk.cc +++ b/custom_ops/gpu_ops/save_output_msg_with_topk.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "custom_ftok.h" #include "paddle/extension.h" @@ -42,6 +45,9 @@ void SaveOutMmsgTopK(const paddle::Tensor& x, const paddle::Tensor& not_need_stop, const paddle::Tensor& preempted_idx, int64_t rank_id) { +#ifdef _WIN32 + PD_THROW("SaveOutMmsgTopK is not supported on Windows (POSIX IPC required)."); +#else if (rank_id > 0) { return; } @@ -145,6 +151,7 @@ void SaveOutMmsgTopK(const paddle::Tensor& x, printf("full msg buffer\n"); } return; +#endif } PD_BUILD_STATIC_OP(save_output_topk) diff --git a/custom_ops/gpu_ops/save_with_output_msg.cc b/custom_ops/gpu_ops/save_with_output_msg.cc index bc1d847c3fe..bf2b7c85a8e 100644 --- a/custom_ops/gpu_ops/save_with_output_msg.cc +++ b/custom_ops/gpu_ops/save_with_output_msg.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include "save_with_output_msg.h" #include "custom_ftok.h" @@ -21,6 +22,9 @@ void save_kernel(const paddle::Tensor& x, int64_t rank_id, int msg_queue_id, bool save_each_rank) { +#ifdef _WIN32 + PD_THROW("save_kernel is not supported on Windows (POSIX IPC required)."); +#else const int64_t* x_data = x.data(); static struct msgdata msg_sed; const int32_t* preempted_idx_data = preempted_idx.data(); @@ -100,6 +104,7 @@ void save_kernel(const paddle::Tensor& x, printf("full msg buffer\n"); } return; +#endif } void SaveOutMmsg(const paddle::Tensor& x, diff --git a/custom_ops/gpu_ops/save_with_output_msg.h b/custom_ops/gpu_ops/save_with_output_msg.h index 98b6f94b7a8..db635c3918f 100644 --- a/custom_ops/gpu_ops/save_with_output_msg.h +++ b/custom_ops/gpu_ops/save_with_output_msg.h @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #ifndef PD_BUILD_STATIC_OP diff --git a/custom_ops/gpu_ops/share_external_data.cu b/custom_ops/gpu_ops/share_external_data.cu index 5fd6df58f45..2981f9851b1 100644 --- a/custom_ops/gpu_ops/share_external_data.cu +++ b/custom_ops/gpu_ops/share_external_data.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards +#include #include #include -#include #include -#include +#include +#ifndef _WIN32 #include #include -#include +#include +#endif #include "cuda_multiprocess.h" #include "helper.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/mtp_save_first_token.cc b/custom_ops/gpu_ops/speculate_decoding/draft_model/mtp_save_first_token.cc index 7b78fae7c26..9809c74edec 100644 --- a/custom_ops/gpu_ops/speculate_decoding/draft_model/mtp_save_first_token.cc +++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/mtp_save_first_token.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "../speculate_msg.h" #include "../../custom_ftok.h" #include "paddle/extension.h" @@ -36,6 +39,11 @@ void MTPSaveFirstToken(const paddle::Tensor& x, int msg_queue_id, bool save_each_rank, bool skip_chunk_prefill) { +#ifdef _WIN32 + PD_THROW( + "MTPSaveFirstToken is not supported on Windows " + "(POSIX IPC required)."); +#else if (!save_each_rank && rank_id > 0) { return; } @@ -155,6 +163,7 @@ void MTPSaveFirstToken(const paddle::Tensor& x, printf("full msg buffer\n"); } return; +#endif } void MTPSaveFirstTokenStatic(const paddle::Tensor& x, diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc index 1a451db350e..cc10a4c381e 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #include "../custom_ftok.h" @@ -31,6 +34,10 @@ void SpeculateGetOutput(const paddle::Tensor& x, bool wait_flag, int msg_queue_id, bool get_each_rank) { +#ifdef _WIN32 + PD_THROW( + "SpeculateGetOutput is not supported on Windows (POSIX IPC required)."); +#else if (!get_each_rank && rank_id > 0) { return; } @@ -76,6 +83,7 @@ void SpeculateGetOutput(const paddle::Tensor& x, out_data[i] = (int64_t)msg_rcv.mtext[i]; } return; +#endif } void SpeculateGetOutputStatic(const paddle::Tensor& x, diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc index 4fd7d4103c4..b1bad3655f6 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc @@ -12,25 +12,48 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #include "../custom_ftok.h" -#include "speculate_logprob_msg.h" #ifndef PD_BUILD_STATIC_OP #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif +#define MAX_BSZ 512 +#define K 20 +#define MAX_DRAFT_TOKEN_NUM 6 + +struct batch_msgdata { + int tokens[MAX_DRAFT_TOKEN_NUM * (K + 1)]; + float scores[MAX_DRAFT_TOKEN_NUM * (K + 1)]; + int ranks[MAX_DRAFT_TOKEN_NUM]; +}; + +struct msgdata { + long mtype; + int meta[3 + MAX_BSZ]; // stop_flag, message_flag, bsz, batch_token_nums + batch_msgdata mtext[MAX_BSZ]; +}; + void SpeculateGetOutMmsgTopK(const paddle::Tensor& output_tokens, const paddle::Tensor& output_scores, const paddle::Tensor& output_ranks, int real_k, int64_t rank_id, bool wait_flag) { +#ifdef _WIN32 + PD_THROW( + "SpeculateGetOutMmsgTopK is not supported on Windows " + "(POSIX IPC required)."); +#else struct msgdata msg_rcv; int msg_queue_id = 1; @@ -78,22 +101,22 @@ void SpeculateGetOutMmsgTopK(const paddle::Tensor& output_tokens, output_tokens_data[1] = (int64_t)msg_rcv.meta[1]; output_tokens_data[2] = (int64_t)msg_rcv.meta[2]; - int output_tokens_offset = 3 + SPEC_LOGPROB_MAX_BSZ; + int output_tokens_offset = 3 + MAX_BSZ; for (int i = 0; i < bsz; i++) { int cur_token_num = msg_rcv.meta[3 + i]; output_tokens_data[3 + i] = (int64_t)cur_token_num; // batch_token_nums auto* cur_output_token = output_tokens_data + output_tokens_offset + - i * (MAX_DRAFT_TOKEN_NUM * (SPEC_LOGPROB_K + 1)); + i * (MAX_DRAFT_TOKEN_NUM * (K + 1)); auto* cur_output_score = - output_scores_data + i * (MAX_DRAFT_TOKEN_NUM * (SPEC_LOGPROB_K + 1)); + output_scores_data + i * (MAX_DRAFT_TOKEN_NUM * (K + 1)); auto* cur_batch_msg_rcv = &msg_rcv.mtext[i]; for (int j = 0; j < cur_token_num; j++) { for (int k = 0; k < real_k + 1; k++) { - cur_output_token[j * (SPEC_LOGPROB_K + 1) + k] = - (int64_t)cur_batch_msg_rcv->tokens[j * (SPEC_LOGPROB_K + 1) + k]; - cur_output_score[j * (SPEC_LOGPROB_K + 1) + k] = - cur_batch_msg_rcv->scores[j * (SPEC_LOGPROB_K + 1) + k]; + cur_output_token[j * (K + 1) + k] = + (int64_t)cur_batch_msg_rcv->tokens[j * (K + 1) + k]; + cur_output_score[j * (K + 1) + k] = + cur_batch_msg_rcv->scores[j * (K + 1) + k]; } output_ranks_data[i * MAX_DRAFT_TOKEN_NUM + j] = (int64_t)cur_batch_msg_rcv->ranks[j]; @@ -109,19 +132,17 @@ void SpeculateGetOutMmsgTopK(const paddle::Tensor& output_tokens, std::cout << "batch " << i << " token_num: " << cur_token_num << std::endl; for (int j = 0; j < cur_token_num; j++) { std::cout << "tokens: "; - for (int k = 0; k < SPEC_LOGPROB_K + 1; k++) { + for (int k = 0; k < K + 1; k++) { std::cout << output_tokens_data[output_tokens_offset + - i * MAX_DRAFT_TOKEN_NUM * - (SPEC_LOGPROB_K + 1) + - j * (SPEC_LOGPROB_K + 1) + k] + i * MAX_DRAFT_TOKEN_NUM * (K + 1) + + j * (K + 1) + k] << " "; } std::cout << std::endl; std::cout << "scores: "; - for (int k = 0; k < SPEC_LOGPROB_K + 1; k++) { - std::cout << output_scores_data[i * MAX_DRAFT_TOKEN_NUM * - (SPEC_LOGPROB_K + 1) + - j * (SPEC_LOGPROB_K + 1) + k] + for (int k = 0; k < K + 1; k++) { + std::cout << output_scores_data[i * MAX_DRAFT_TOKEN_NUM * (K + 1) + + j * (K + 1) + k] << " "; } std::cout << std::endl; @@ -132,6 +153,7 @@ void SpeculateGetOutMmsgTopK(const paddle::Tensor& output_tokens, std::cout << std::endl; #endif return; +#endif } PD_BUILD_STATIC_OP(speculate_get_output_topk) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_msg.h b/custom_ops/gpu_ops/speculate_decoding/speculate_msg.h index a212bb9c1bf..82e34bf7e21 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_msg.h +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_msg.h @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #pragma once #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #define MAX_BSZ 256 diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc index f72f3774107..f634b3a33fe 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #include "../custom_ftok.h" @@ -36,6 +39,11 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens, int msg_queue_id, int save_each_rank, bool skip_prefill) { +#ifdef _WIN32 + PD_THROW( + "SpeculateSaveWithOutputMsg is not supported on Windows " + "(POSIX IPC required)."); +#else // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling // outputs, so only rank 0 needs to send results to the message queue. if (rank_id > 0) { @@ -134,6 +142,7 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens, printf("full msg buffer\n"); } return; +#endif } void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens, diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc index 0b3de384cee..352a90aef4b 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc @@ -12,19 +12,37 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include #include +#include +#ifndef _WIN32 #include #include -#include +#endif #include "paddle/extension.h" #include "../custom_ftok.h" -#include "speculate_logprob_msg.h" #ifndef PD_BUILD_STATIC_OP #define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name) #endif +#define MAX_BSZ 512 +#define K 20 +#define MAX_DRAFT_TOKEN_NUM 6 + +struct batch_msgdata { + int tokens[MAX_DRAFT_TOKEN_NUM * (K + 1)]; + float scores[MAX_DRAFT_TOKEN_NUM * (K + 1)]; + int ranks[MAX_DRAFT_TOKEN_NUM]; +}; + +struct msgdata { + long mtype; + int meta[3 + MAX_BSZ]; // stop_flag, message_flag, bsz, batch_token_nums + batch_msgdata mtext[MAX_BSZ]; +}; + void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, const paddle::Tensor& logprob_token_ids, const paddle::Tensor& logprob_scores, @@ -38,6 +56,11 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, int message_flag, // Target: 3, Draft: 4 int64_t rank_id, bool save_each_rank) { +#ifdef _WIN32 + PD_THROW( + "SpeculateSaveOutMmsgTopK is not supported on Windows " + "(POSIX IPC required)."); +#else // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling // outputs, so only rank 0 needs to send results to the message queue. if (rank_id > 0) { @@ -139,21 +162,16 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, auto* cur_batch_msg_sed = &msg_sed.mtext[i]; int token_offset = cu_batch_token_offset_data[i]; for (int j = 0; j < cur_token_num; j++) { - auto* cur_tokens = &cur_batch_msg_sed->tokens[j * (SPEC_LOGPROB_K + 1)]; - auto* cur_scores = &cur_batch_msg_sed->scores[j * (SPEC_LOGPROB_K + 1)]; - for (int k = 0; k < SPEC_LOGPROB_K + 1; k++) { + auto* cur_tokens = &cur_batch_msg_sed->tokens[j * (K + 1)]; + auto* cur_scores = &cur_batch_msg_sed->scores[j * (K + 1)]; + for (int k = 0; k < K + 1; k++) { if (k == 0) { cur_tokens[k] = (int)sampled_token_ids_data[i * max_draft_tokens + j]; - cur_scores[k] = - logprob_scores_data[(token_offset + j) * (SPEC_LOGPROB_K + 1) + - k]; + cur_scores[k] = logprob_scores_data[(token_offset + j) * (K + 1) + k]; } else if (k < max_num_logprobs) { - cur_tokens[k] = (int) - logprob_token_ids_data[(token_offset + j) * (SPEC_LOGPROB_K + 1) + - k]; - cur_scores[k] = - logprob_scores_data[(token_offset + j) * (SPEC_LOGPROB_K + 1) + - k]; + cur_tokens[k] = + (int)logprob_token_ids_data[(token_offset + j) * (K + 1) + k]; + cur_scores[k] = logprob_scores_data[(token_offset + j) * (K + 1) + k]; } else { cur_tokens[k] = -1; cur_scores[k] = 0.0; @@ -172,15 +190,15 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, auto* cur_batch_msg_sed = &msg_sed.mtext[i]; std::cout << "batch " << i << " token_num: " << cur_token_num << std::endl; for (int j = 0; j < cur_token_num; j++) { - auto* cur_tokens = &cur_batch_msg_sed->tokens[j * (SPEC_LOGPROB_K + 1)]; - auto* cur_scores = &cur_batch_msg_sed->scores[j * (SPEC_LOGPROB_K + 1)]; + auto* cur_tokens = &cur_batch_msg_sed->tokens[j * (K + 1)]; + auto* cur_scores = &cur_batch_msg_sed->scores[j * (K + 1)]; std::cout << "tokens: "; - for (int k = 0; k < SPEC_LOGPROB_K + 1; k++) { + for (int k = 0; k < K + 1; k++) { std::cout << cur_tokens[k] << " "; } std::cout << std::endl; std::cout << "scores: "; - for (int k = 0; k < SPEC_LOGPROB_K + 1; k++) { + for (int k = 0; k < K + 1; k++) { std::cout << cur_scores[k] << " "; } std::cout << std::endl; @@ -192,6 +210,7 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, if (msgsnd(msgid, &msg_sed, sizeof(msg_sed) - sizeof(long), 0) == -1) { printf("full msg buffer\n"); } +#endif } PD_BUILD_STATIC_OP(speculate_save_output_topk) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu index f2a8cd6f11f..3916aeaa139 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include "../custom_ftok.h" #include "helper.h" #include "speculate_msg.h" @@ -222,6 +223,11 @@ void SpeculateStepSchedule( const int block_size, const int encoder_decoder_block_num, const int max_draft_tokens) { +#ifdef _WIN32 + PD_THROW( + "SpeculateStepSchedule is not supported on Windows " + "(POSIX IPC required)."); +#else auto cu_stream = seq_lens_this_time.stream(); const int bsz = seq_lens_this_time.shape()[0]; const int block_num_per_seq = block_tables.shape()[1]; @@ -328,6 +334,7 @@ void SpeculateStepSchedule( printf("full msg buffer\n"); } } +#endif } PD_BUILD_STATIC_OP(speculate_step_reschedule) diff --git a/custom_ops/gpu_ops/step_reschedule.cu b/custom_ops/gpu_ops/step_reschedule.cu index 24d37104d0f..15a5b520575 100644 --- a/custom_ops/gpu_ops/step_reschedule.cu +++ b/custom_ops/gpu_ops/step_reschedule.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Hackathon 10th Spring No.46 — compilation guards #include "custom_ftok.h" #include "helper.h" #include "save_with_output_msg.h" @@ -211,6 +212,11 @@ void Schedule(const paddle::Tensor &stop_flags, const paddle::Tensor &first_token_ids, const int block_size, const int encoder_decoder_block_num) { +#ifdef _WIN32 + PD_THROW( + "Schedule is not supported on Windows " + "(POSIX IPC required)."); +#else auto cu_stream = seq_lens_this_time.stream(); const int bsz = seq_lens_this_time.shape()[0]; const int block_num_per_seq = block_tables.shape()[1]; @@ -314,6 +320,7 @@ void Schedule(const paddle::Tensor &stop_flags, printf("full msg buffer\n"); } } +#endif } PD_BUILD_STATIC_OP(step_reschedule) diff --git a/custom_ops/gpu_ops/stop_generation.cu b/custom_ops/gpu_ops/stop_generation.cu index 1dcb3c3aae5..55464b7c86e 100644 --- a/custom_ops/gpu_ops/stop_generation.cu +++ b/custom_ops/gpu_ops/stop_generation.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +// Hackathon 10th Spring No.46 — compilation guards #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include "paddle/extension.h" #ifndef PD_BUILD_STATIC_OP @@ -84,6 +87,11 @@ std::vector GetStopFlags(const paddle::Tensor &topk_ids, auto stop_flags_out = stop_flags.copy_to(stop_flags.place(), false); // gpu -> gpu if (mode == 0 || mode == 1) { +#ifdef _WIN32 + PD_THROW( + "StopGeneration mode 0/1 is not supported on Windows " + "(POSIX mmap required)."); +#else constexpr char *path = "/root/paddlejob/workspace/env_run/lzy/ERNIE_ALL/" "ERNIE3.0-fused-fp16/ops/test"; @@ -121,6 +129,7 @@ std::vector GetStopFlags(const paddle::Tensor &topk_ids, bs_now, end_id); } +#endif } else if (mode == 2) { int block_size = (bs_now + 32 - 1) / 32 * 32; set_value_by_flags<<<1, block_size, 0, cu_stream>>>( diff --git a/custom_ops/gpu_ops/stop_generation_multi_ends.cu b/custom_ops/gpu_ops/stop_generation_multi_ends.cu index 06cf99831d7..eb923342610 100644 --- a/custom_ops/gpu_ops/stop_generation_multi_ends.cu +++ b/custom_ops/gpu_ops/stop_generation_multi_ends.cu @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include +// Hackathon 10th Spring No.46 — compilation guards #include #include #include -#include #include #include +#ifndef _WIN32 +#include +#include #include +#endif #include "helper.h" #include "paddle/extension.h"