From fa8e3b9661287a7aef47c45c0511053739090e6d Mon Sep 17 00:00:00 2001 From: thesues Date: Thu, 1 May 2025 23:17:38 +0000 Subject: [PATCH 01/12] Client done --- infinistore/example/client_async.py | 3 +- src/Makefile | 2 +- src/libinfinistore.cpp | 511 ++++++++++++++-------------- src/libinfinistore.h | 41 +-- 4 files changed, 284 insertions(+), 273 deletions(-) diff --git a/infinistore/example/client_async.py b/infinistore/example/client_async.py index aef479e..457b86a 100644 --- a/infinistore/example/client_async.py +++ b/infinistore/example/client_async.py @@ -26,7 +26,8 @@ async def main(): rdma_conn = infinistore.InfinityConnection(config) # FIXME: This is a blocking call, should be async - await rdma_conn.connect_async() + # await rdma_conn.connect_async() + rdma_conn.connect() src_tensor = torch.tensor( [i for i in range(4096)], device="cpu", dtype=torch.float32 diff --git a/src/Makefile b/src/Makefile index 7fda7af..bb22870 100644 --- a/src/Makefile +++ b/src/Makefile @@ -32,7 +32,7 @@ manylinux: $(PYBIND_TARGET) -include $(OBJECTS:.o=.d) -$(PYBIND_TARGET): pybind.cpp libinfinistore.o utils.o protocol.o infinistore.o log.o ibv_helper.o mempool.o +$(PYBIND_TARGET): pybind.cpp libinfinistore.o utils.o protocol.o infinistore.o log.o ibv_helper.o mempool.o rdma.o $(CXX) $(CXXFLAGS) $(INCLUDES) --shared -fPIC $(PYBIND11_INCLUDES) $^ \ -o $(PYBIND_TARGET) $(LDFLAGS) $(LIBS) rm -rf ../infinistore/$(PYBIND_TARGET) diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index 9bc8e6d..eaa5200 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -13,9 +13,9 @@ #include #include "config.h" -#include "ibv_helper.h" #include "log.h" #include "protocol.h" +#include "rdma.h" #include "utils.h" SendBuffer::SendBuffer(struct ibv_pd *pd, size_t size) { @@ -49,7 +49,7 @@ void Connection::close_conn() { stop_ = true; // create fake wr to wake up cq thread - ibv_req_notify_cq(cq_, 0); + ibv_req_notify_cq(ctx_.cq, 0); struct ibv_sge sge; memset(&sge, 0, sizeof(sge)); sge.addr = (uintptr_t)this; @@ -66,7 +66,7 @@ void Connection::close_conn() { struct ibv_send_wr *bad_send_wr; - ibv_post_send(qp_, &send_wr, &bad_send_wr); + ibv_post_send(ctx_.qp, &send_wr, &bad_send_wr); // wait thread done cq_future_.get(); @@ -96,6 +96,8 @@ Connection::~Connection() { } local_mr_.clear(); + destroy_rdma_context(&ctx_); + // if (recv_mr_) { // ibv_dereg_mr(recv_mr_); // } @@ -104,186 +106,187 @@ Connection::~Connection() { // free(recv_buffer_); // } - if (qp_) { - struct ibv_qp_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RESET; - ibv_modify_qp(qp_, &attr, IBV_QP_STATE); - } - if (qp_) { - ibv_destroy_qp(qp_); - } - if (cq_) { - ibv_destroy_cq(cq_); - } - - if (comp_channel_) { - ibv_destroy_comp_channel(comp_channel_); - } - if (pd_) { - ibv_dealloc_pd(pd_); - } - if (ib_ctx_) { - ibv_close_device(ib_ctx_); - } -} - -int Connection::init_rdma_resources(client_config_t config) { - // Get list of RDMA devices - struct ibv_device **dev_list; - struct ibv_device *ib_dev; - int num_devices; - - dev_list = ibv_get_device_list(&num_devices); - if (!dev_list) { - ERROR("Failed to get RDMA devices list"); - return -1; - } - - for (int i = 0; i < num_devices; ++i) { - char *dev_name_from_list = (char *)ibv_get_device_name(dev_list[i]); - if (strcmp(dev_name_from_list, config.dev_name.c_str()) == 0) { - INFO("found device {}", dev_name_from_list); - ib_dev = dev_list[i]; - ib_ctx_ = ibv_open_device(ib_dev); - break; - } - } - - if (!ib_ctx_) { - INFO( - "Can't find or failed to open the specified device, try to open " - "the default device {}", - (char *)ibv_get_device_name(dev_list[0])); - ib_ctx_ = ibv_open_device(dev_list[0]); - if (!ib_ctx_) { - ERROR("Failed to open the default device"); - return -1; - } - } - ibv_free_device_list(dev_list); - - struct ibv_port_attr port_attr; - ib_port_ = config.ib_port; - if (ibv_query_port(ib_ctx_, ib_port_, &port_attr)) { - ERROR("Unable to query port {} attributes\n", ib_port_); - return -1; - } - - int gidx = 0; - if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && config.link_type == "Ethernet") || - (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && config.link_type == "IB")) { - ERROR("port link layer and config link type don't match"); - return -1; - } - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { - gidx = -1; - } - else { - gidx = ibv_find_sgid_type(ib_ctx_, ib_port_, IBV_GID_TYPE_ROCE_V2, AF_INET); - if (gidx < 0) { - ERROR("Failed to find GID"); - return -1; - } - } - - lid_ = port_attr.lid; - gidx_ = gidx; - - active_mtu_ = port_attr.active_mtu; - - union ibv_gid gid; - // get gid - if (gidx_ != -1 && ibv_query_gid(ib_ctx_, 1, gidx_, &gid)) { - ERROR("Failed to get GID"); - return -1; - } - - // Allocate Protection Domain - pd_ = ibv_alloc_pd(ib_ctx_); - if (!pd_) { - ERROR("Failed to allocate PD"); - return -1; - } - - comp_channel_ = ibv_create_comp_channel(ib_ctx_); - if (!comp_channel_) { - ERROR("Failed to create completion channel"); - return -1; - } - - // Create Completion Queue - cq_ = ibv_create_cq(ib_ctx_, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); - if (!cq_) { - ERROR("Failed to create CQ"); - return -1; - } - - if (ibv_req_notify_cq(cq_, 0)) { - ERROR("Failed to request CQ notification"); - return -1; - } - - // Create Queue Pair - struct ibv_qp_init_attr qp_init_attr = {}; - qp_init_attr.send_cq = cq_; - qp_init_attr.recv_cq = cq_; - qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection - qp_init_attr.cap.max_send_wr = MAX_SEND_WR; - qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; - qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.cap.max_recv_sge = 1; - - qp_ = ibv_create_qp(pd_, &qp_init_attr); - if (!qp_) { - ERROR("Failed to create QP, {}", strerror(errno)); - return -1; - } - - // Modify QP to INIT state - if (modify_qp_to_init()) { - ERROR("Failed to modify QP to INIT, {}", strerror(errno)); - return -1; - } - - local_info_.qpn = qp_->qp_num; - local_info_.psn = lrand48() & 0xffffff; - if (gidx != -1) { - local_info_.gid = gid; - DEBUG("gid index: {}", gidx); - } - local_info_.lid = lid_; - - local_info_.mtu = (uint32_t)active_mtu_; + // if (qp_) { + // struct ibv_qp_attr attr; + // memset(&attr, 0, sizeof(attr)); + // attr.qp_state = IBV_QPS_RESET; + // ibv_modify_qp(qp_, &attr, IBV_QP_STATE); + // } + // if (qp_) { + // ibv_destroy_qp(qp_); + // } + // if (cq_) { + // ibv_destroy_cq(cq_); + // } - print_rdma_conn_info(&local_info_, false); - return 0; + // if (comp_channel_) { + // ibv_destroy_comp_channel(comp_channel_); + // } + // if (pd_) { + // ibv_dealloc_pd(pd_); + // } + // if (ib_ctx_) { + // ibv_close_device(ib_ctx_); + // } } -int Connection::modify_qp_to_init() { - struct ibv_qp_attr attr = {}; - attr.qp_state = IBV_QPS_INIT; - attr.port_num = ib_port_; - attr.pkey_index = 0; - attr.qp_access_flags = - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; - - int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; - - int ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to INIT"); - return ret; - } - return 0; -} +// int Connection::init_rdma_resources(client_config_t config) { +// // Get list of RDMA devices +// struct ibv_device **dev_list; +// struct ibv_device *ib_dev; +// int num_devices; + +// dev_list = ibv_get_device_list(&num_devices); +// if (!dev_list) { +// ERROR("Failed to get RDMA devices list"); +// return -1; +// } + +// for (int i = 0; i < num_devices; ++i) { +// char *dev_name_from_list = (char *)ibv_get_device_name(dev_list[i]); +// if (strcmp(dev_name_from_list, config.dev_name.c_str()) == 0) { +// INFO("found device {}", dev_name_from_list); +// ib_dev = dev_list[i]; +// ib_ctx_ = ibv_open_device(ib_dev); +// break; +// } +// } + +// if (!ib_ctx_) { +// INFO( +// "Can't find or failed to open the specified device, try to open " +// "the default device {}", +// (char *)ibv_get_device_name(dev_list[0])); +// ib_ctx_ = ibv_open_device(dev_list[0]); +// if (!ib_ctx_) { +// ERROR("Failed to open the default device"); +// return -1; +// } +// } +// ibv_free_device_list(dev_list); + +// struct ibv_port_attr port_attr; +// ib_port_ = config.ib_port; +// if (ibv_query_port(ib_ctx_, ib_port_, &port_attr)) { +// ERROR("Unable to query port {} attributes\n", ib_port_); +// return -1; +// } + +// int gidx = 0; +// if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && config.link_type == "Ethernet") || +// (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && config.link_type == "IB")) { +// ERROR("port link layer and config link type don't match"); +// return -1; +// } +// if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { +// gidx = -1; +// } +// else { +// gidx = ibv_find_sgid_type(ib_ctx_, ib_port_, IBV_GID_TYPE_ROCE_V2, AF_INET); +// if (gidx < 0) { +// ERROR("Failed to find GID"); +// return -1; +// } +// } + +// lid_ = port_attr.lid; +// gidx_ = gidx; + +// active_mtu_ = port_attr.active_mtu; + +// union ibv_gid gid; +// // get gid +// if (gidx_ != -1 && ibv_query_gid(ib_ctx_, 1, gidx_, &gid)) { +// ERROR("Failed to get GID"); +// return -1; +// } + +// // Allocate Protection Domain +// pd_ = ibv_alloc_pd(ib_ctx_); +// if (!pd_) { +// ERROR("Failed to allocate PD"); +// return -1; +// } + +// comp_channel_ = ibv_create_comp_channel(ib_ctx_); +// if (!comp_channel_) { +// ERROR("Failed to create completion channel"); +// return -1; +// } + +// // Create Completion Queue +// cq_ = ibv_create_cq(ib_ctx_, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); +// if (!cq_) { +// ERROR("Failed to create CQ"); +// return -1; +// } + +// if (ibv_req_notify_cq(cq_, 0)) { +// ERROR("Failed to request CQ notification"); +// return -1; +// } + +// // Create Queue Pair +// struct ibv_qp_init_attr qp_init_attr = {}; +// qp_init_attr.send_cq = cq_; +// qp_init_attr.recv_cq = cq_; +// qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection +// qp_init_attr.cap.max_send_wr = MAX_SEND_WR; +// qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; +// qp_init_attr.cap.max_send_sge = 1; +// qp_init_attr.cap.max_recv_sge = 1; + +// qp_ = ibv_create_qp(pd_, &qp_init_attr); +// if (!qp_) { +// ERROR("Failed to create QP, {}", strerror(errno)); +// return -1; +// } + +// // Modify QP to INIT state +// if (modify_qp_to_init()) { +// ERROR("Failed to modify QP to INIT, {}", strerror(errno)); +// return -1; +// } + +// local_info_.qpn = qp_->qp_num; +// local_info_.psn = lrand48() & 0xffffff; +// if (gidx != -1) { +// local_info_.gid = gid; +// DEBUG("gid index: {}", gidx); +// } +// local_info_.lid = lid_; + +// local_info_.mtu = (uint32_t)active_mtu_; + +// print_rdma_conn_info(&local_info_, false); +// return 0; +// } + +// int Connection::modify_qp_to_init() { +// struct ibv_qp_attr attr = {}; +// attr.qp_state = IBV_QPS_INIT; +// attr.port_num = ib_port_; +// attr.pkey_index = 0; +// attr.qp_access_flags = +// IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; + +// int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + +// int ret = ibv_modify_qp(qp_, &attr, flags); +// if (ret) { +// ERROR("Failed to modify QP to INIT"); +// return ret; +// } +// return 0; +// } void Connection::cq_handler() { - assert(comp_channel_ != NULL); + assert(ctx_.comp_channel != NULL); + while (!stop_) { struct ibv_cq *ev_cq; void *ev_ctx; - int ret = ibv_get_cq_event(comp_channel_, &ev_cq, &ev_ctx); + int ret = ibv_get_cq_event(ctx_.comp_channel, &ev_cq, &ev_ctx); if (ret == 0) { ibv_ack_cq_events(ev_cq, 1); if (ibv_req_notify_cq(ev_cq, 0)) { @@ -293,7 +296,7 @@ void Connection::cq_handler() { struct ibv_wc wc[10] = {}; int num_completions; - while ((num_completions = ibv_poll_cq(cq_, 10, wc)) && num_completions > 0) { + while ((num_completions = ibv_poll_cq(ctx_.cq, 10, wc)) && num_completions > 0) { for (int i = 0; i < num_completions; i++) { if (wc[i].status != IBV_WC_SUCCESS) { // only fake wr will use IBV_WC_SEND @@ -369,8 +372,13 @@ SendBuffer *Connection::get_send_buffer() { void Connection::release_send_buffer(SendBuffer *buffer) { send_buffers_.push(buffer); } int Connection::setup_rdma(client_config_t config) { - if (init_rdma_resources(config) < 0) { - ERROR("Failed to initialize RDMA resources"); + // if (init_rdma_resources(config) < 0) { + // ERROR("Failed to initialize RDMA resources"); + // return -1; + // } + + if (init_rdma_context(config.dev_name, config.ib_port, config.link_type, &ctx_) < 0) { + ERROR("Failed to initialize RDMA context"); return -1; } @@ -379,15 +387,16 @@ int Connection::setup_rdma(client_config_t config) { return -1; } - print_rdma_conn_info(&remote_info_, true); + print_rdma_conn_info(&ctx_.remote_info, true); + print_rdma_conn_info(&ctx_.local_info, false); // Modify QP to RTR state - if (modify_qp_to_rtr()) { + if (modify_qp_to_rtr(&ctx_)) { ERROR("Failed to modify QP to RTR"); return -1; } - if (modify_qp_to_rts()) { + if (modify_qp_to_rts(&ctx_)) { ERROR("Failed to modify QP to RTS"); return -1; } @@ -397,7 +406,7 @@ int Connection::setup_rdma(client_config_t config) { because server also has the same number of buffers */ for (int i = 0; i < MAX_RECV_WR; i++) { - send_buffers_.push(new SendBuffer(pd_, PROTOCOL_BUFFER_SIZE)); + send_buffers_.push(new SendBuffer(ctx_.pd, PROTOCOL_BUFFER_SIZE)); } stop_ = false; @@ -436,69 +445,69 @@ int Connection::init_connection(client_config_t config) { return 0; } -int Connection::modify_qp_to_rtr() { - struct ibv_qp_attr attr = {}; - attr.qp_state = IBV_QPS_RTR; - - // update MTU - if (remote_info_.mtu != active_mtu_) { - WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", - 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu_ + 7)); - } - attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu_, (uint32_t)remote_info_.mtu); - - attr.dest_qp_num = remote_info_.qpn; - attr.rq_psn = remote_info_.psn; - attr.max_dest_rd_atomic = 16; - attr.min_rnr_timer = 12; - attr.ah_attr.dlid = 0; - attr.ah_attr.sl = 0; - attr.ah_attr.src_path_bits = 0; - attr.ah_attr.port_num = ib_port_; - - if (gidx_ == -1) { - // IB - attr.ah_attr.dlid = remote_info_.lid; - attr.ah_attr.is_global = 0; - } - else { - // RoCE v2 - attr.ah_attr.is_global = 1; - attr.ah_attr.grh.dgid = remote_info_.gid; - attr.ah_attr.grh.sgid_index = gidx_; // local gid - attr.ah_attr.grh.hop_limit = 1; - } - - int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; - - int ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to RTR"); - return ret; - } - return 0; -} - -int Connection::modify_qp_to_rts() { - struct ibv_qp_attr attr = {}; - attr.qp_state = IBV_QPS_RTS; - attr.timeout = 14; - attr.retry_cnt = 7; - attr.rnr_retry = 7; - attr.sq_psn = local_info_.psn; // Use 0 or match with local PSN - attr.max_rd_atomic = 16; - - int flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | - IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; - - int ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to RTS"); - return ret; - } - return 0; -} +// int Connection::modify_qp_to_rtr() { +// struct ibv_qp_attr attr = {}; +// attr.qp_state = IBV_QPS_RTR; + +// // update MTU +// if (remote_info_.mtu != active_mtu_) { +// WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", +// 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu_ + 7)); +// } +// attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu_, (uint32_t)remote_info_.mtu); + +// attr.dest_qp_num = remote_info_.qpn; +// attr.rq_psn = remote_info_.psn; +// attr.max_dest_rd_atomic = 16; +// attr.min_rnr_timer = 12; +// attr.ah_attr.dlid = 0; +// attr.ah_attr.sl = 0; +// attr.ah_attr.src_path_bits = 0; +// attr.ah_attr.port_num = ib_port_; + +// if (gidx_ == -1) { +// // IB +// attr.ah_attr.dlid = remote_info_.lid; +// attr.ah_attr.is_global = 0; +// } +// else { +// // RoCE v2 +// attr.ah_attr.is_global = 1; +// attr.ah_attr.grh.dgid = remote_info_.gid; +// attr.ah_attr.grh.sgid_index = gidx_; // local gid +// attr.ah_attr.grh.hop_limit = 1; +// } + +// int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | +// IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + +// int ret = ibv_modify_qp(qp_, &attr, flags); +// if (ret) { +// ERROR("Failed to modify QP to RTR"); +// return ret; +// } +// return 0; +// } + +// int Connection::modify_qp_to_rts() { +// struct ibv_qp_attr attr = {}; +// attr.qp_state = IBV_QPS_RTS; +// attr.timeout = 14; +// attr.retry_cnt = 7; +// attr.rnr_retry = 7; +// attr.sq_psn = local_info_.psn; // Use 0 or match with local PSN +// attr.max_rd_atomic = 16; + +// int flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | +// IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; + +// int ret = ibv_modify_qp(qp_, &attr, flags); +// if (ret) { +// ERROR("Failed to modify QP to RTS"); +// return ret; +// } +// return 0; +// } int Connection::exchange_conn_info() { header_t header = { @@ -512,7 +521,7 @@ int Connection::exchange_conn_info() { iov[0].iov_base = &header; iov[0].iov_len = FIXED_HEADER_SIZE; - iov[1].iov_base = &local_info_; + iov[1].iov_base = &ctx_.local_info; iov[1].iov_len = sizeof(rdma_conn_info_t); memset(&msg, 0, sizeof(msg)); @@ -535,7 +544,7 @@ int Connection::exchange_conn_info() { return -1; } - if (recv(sock_, &remote_info_, sizeof(rdma_conn_info_t), MSG_WAITALL) != + if (recv(sock_, &ctx_.remote_info, sizeof(rdma_conn_info_t), MSG_WAITALL) != sizeof(rdma_conn_info_t)) { ERROR("Failed to receive remote connection information"); return -1; @@ -704,7 +713,7 @@ void Connection::post_recv_ack(rdma_info_base *info) { recv_wr.sg_list = NULL; recv_wr.num_sge = 0; - int ret = ibv_post_recv(qp_, &recv_wr, &bad_recv_wr); + int ret = ibv_post_recv(ctx_.qp, &recv_wr, &bad_recv_wr); if (ret) { ERROR("Failed to post recv wr :{}", strerror(ret)); } @@ -873,7 +882,7 @@ int Connection::w_rdma_async(const std::vector &keys, wr.num_sge = 1; wr.send_flags = IBV_SEND_SIGNALED; - int ret = ibv_post_send(qp_, &wr, &bad_wr); + int ret = ibv_post_send(ctx_.qp, &wr, &bad_wr); if (ret) { ERROR("Failed to post RDMA send :{}", strerror(ret)); return -1; @@ -941,7 +950,7 @@ int Connection::r_rdma_async(const std::vector &keys, wr.send_flags = IBV_SEND_SIGNALED; int ret; - ret = ibv_post_send(qp_, &wr, &bad_wr); + ret = ibv_post_send(ctx_.qp, &wr, &bad_wr); if (ret) { ERROR("Failed to post RDMA send :{}", strerror(ret)); @@ -958,7 +967,7 @@ int Connection::register_mr(void *base_ptr, size_t ptr_region_size) { ibv_dereg_mr(local_mr_[(uintptr_t)base_ptr]); } struct ibv_mr *mr; - mr = ibv_reg_mr(pd_, base_ptr, ptr_region_size, + mr = ibv_reg_mr(ctx_.pd, base_ptr, ptr_region_size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr) { ERROR("Failed to register memory regions, size: {}", ptr_region_size); diff --git a/src/libinfinistore.h b/src/libinfinistore.h index abed0eb..aa42c6b 100644 --- a/src/libinfinistore.h +++ b/src/libinfinistore.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -18,6 +17,7 @@ #include "config.h" #include "log.h" #include "protocol.h" +#include "rdma.h" // RDMA send buffer // because write_cache will be invoked asynchronously, @@ -65,20 +65,21 @@ class Connection { int sock_ = 0; // rdma connections - struct ibv_context *ib_ctx_ = NULL; - struct ibv_pd *pd_ = NULL; - struct ibv_cq *cq_ = NULL; - struct ibv_qp *qp_ = NULL; - int gidx_ = -1; - int lid_ = -1; - uint8_t ib_port_ = -1; - - // local active_mtu attr, after exchanging with remote, we will use the min of the two for - // path.mtu - ibv_mtu active_mtu_; - - rdma_conn_info_t local_info_; - rdma_conn_info_t remote_info_; + // struct ibv_context *ib_ctx_ = NULL; + // struct ibv_pd *pd_ = NULL; + // struct ibv_cq *cq_ = NULL; + // struct ibv_qp *qp_ = NULL; + // int gidx_ = -1; + // int lid_ = -1; + // uint8_t ib_port_ = -1; + + // // local active_mtu attr, after exchanging with remote, we will use the min of the two for + // // path.mtu + // ibv_mtu active_mtu_; + struct rdma_context ctx_; + + // rdma_conn_info_t local_info_; + // rdma_conn_info_t remote_info_; std::unordered_map local_mr_; @@ -88,7 +89,7 @@ class Connection { */ boost::lockfree::spsc_queue send_buffers_{MAX_RECV_WR}; - struct ibv_comp_channel *comp_channel_ = NULL; + // struct ibv_comp_channel *comp_channel_ = NULL; std::future cq_future_; // cq thread std::atomic stop_{false}; @@ -115,11 +116,11 @@ class Connection { int delete_keys(const std::vector &keys); int register_mr(void *base_ptr, size_t ptr_region_size); - int modify_qp_to_init(); - int modify_qp_to_rts(); - int modify_qp_to_rtr(); + // int modify_qp_to_init(); + // int modify_qp_to_rts(); + // int modify_qp_to_rtr(); int exchange_conn_info(); - int init_rdma_resources(client_config_t config); + // int init_rdma_resources(client_config_t config); void post_recv_ack(rdma_info_base *info); From 516dde5e3eb63e7dfb6b327db01ca6fbf6f4fba5 Mon Sep 17 00:00:00 2001 From: thesues Date: Fri, 2 May 2025 04:33:54 +0000 Subject: [PATCH 02/12] WIP --- src/infinistore.cpp | 1 + src/rdma.cpp | 259 ++++++++++++++++++++++++++++++++++++++++++++ src/rdma.h | 45 ++++++++ 3 files changed, 305 insertions(+) create mode 100644 src/rdma.cpp create mode 100644 src/rdma.h diff --git a/src/infinistore.cpp b/src/infinistore.cpp index 50bc574..02efe65 100644 --- a/src/infinistore.cpp +++ b/src/infinistore.cpp @@ -21,6 +21,7 @@ #include "ibv_helper.h" #include "protocol.h" +#include "rdma.h" server_config_t global_config; diff --git a/src/rdma.cpp b/src/rdma.cpp new file mode 100644 index 0000000..b63e672 --- /dev/null +++ b/src/rdma.cpp @@ -0,0 +1,259 @@ +#include "rdma.h" + +#include + +#include "log.h" + +int close_rdma_device(struct rdma_device *rdma_dev) { + if (rdma_dev->pd) { + ibv_dealloc_pd(rdma_dev->pd); + } + if (rdma_dev->ib_ctx) { + ibv_close_device(rdma_dev->ib_ctx); + } + return 0; +} + +int destroy_rdma_context(struct rdma_context *ctx) { + assert(ctx->rdma_dev != NULL, "destroy_rdma_context should be called before close_rdma_device"); + + if (ctx->qp) { + struct ibv_qp_attr attr; + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IBV_QPS_RESET; + ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE); + ibv_destroy_qp(ctx->qp); + } + + if (ctx->cq) { + ibv_destroy_cq(ctx->cq); + } + + if (ctx->comp_channel) { + ibv_destroy_comp_channel(ctx->comp_channel); + } + return 0; +} + +int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, + struct rdma_device *rdma_dev) { + assert(link_type == "IB" || link_type == "Ethernet"); + assert(rdma_dev != NULL); + + rdma_dev->link_type = link_type; + + struct ibv_device **dev_list; + struct ibv_device *ib_dev; + int num_devices; + dev_list = ibv_get_device_list(&num_devices); + if (!dev_list) { + ERROR("Failed to get RDMA devices list"); + return -1; + } + + for (int i = 0; i < num_devices; ++i) { + char *dev_name_from_list = (char *)ibv_get_device_name(dev_list[i]); + if (strcmp(dev_name_from_list, dev_name.c_str()) == 0) { + INFO("found device {}", dev_name_from_list); + ib_dev = dev_list[i]; + rdma_dev->ib_ctx = ibv_open_device(ib_dev); + break; + } + } + + if (!rdma_dev->ib_ctx) { + INFO( + "Can't find or failed to open the specified device, try to open " + "the default device {}", + (char *)ibv_get_device_name(dev_list[0])); + rdma_dev->ib_ctx = ibv_open_device(dev_list[0]); + if (!rdma_dev->ib_ctx) { + ERROR("Failed to open the default device"); + return -1; + } + } + + struct ibv_port_attr port_attr; + rdma_dev->ib_port = ib_port; + if (ibv_query_port(rdma_dev->ib_ctx, rdma_dev->ib_port, &port_attr)) { + ERROR("Unable to query port {} attributes\n", rdma_dev->ib_port); + return -1; + } + if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && link_type == "Ethernet") || + (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && link_type == "IB")) { + ERROR("port link layer and config link type don't match"); + return -1; + } + if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { + rdma_dev->gid_index = -1; + } + else { + rdma_dev->gid_index = + ibv_find_sgid_type(rdma_dev->ib_ctx, rdma_dev->ib_port, IBV_GID_TYPE_ROCE_V2, AF_INET); + if (rdma_dev->gid_index < 0) { + ERROR("Failed to find GID"); + return -1; + } + } + + rdma_dev->lid = port_attr.lid; + rdma_dev->active_mtu = port_attr.active_mtu; + + union ibv_gid gid; + // get gid + if (rdma_dev->gid_index != -1 && + ibv_query_gid(rdma_dev->ib_ctx, 1, rdma_dev->gid_index, &gid)) { + ERROR("Failed to get GID"); + return -1; + } + + rdma_dev->pd = ibv_alloc_pd(rdma_dev->ib_ctx); + if (!rdma_dev->pd) { + ERROR("Failed to allocate PD"); + return -1; + } +} + +int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { + assert(ctx != NULL); + assert(rdma_dev != NULL); + + // work like a weak_ptr + ctx->rdma_dev = rdma_dev; + + ctx->comp_channel = ibv_create_comp_channel(rdma_dev->ib_ctx); + if (!ctx->comp_channel) { + ERROR("Failed to create completion channel"); + return -1; + } + + // Create Completion Queue + ctx->cq = + ibv_create_cq(rdma_dev->ib_ctx, MAX_SEND_WR + MAX_RECV_WR, NULL, ctx->comp_channel, 0); + if (!ctx->cq) { + ERROR("Failed to create CQ"); + return -1; + } + + if (ibv_req_notify_cq(ctx->cq, 0)) { + ERROR("Failed to request CQ notification"); + return -1; + } + + // Create Queue Pair + struct ibv_qp_init_attr qp_init_attr = {}; + qp_init_attr.send_cq = ctx->cq; + qp_init_attr.recv_cq = ctx->cq; + qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection + qp_init_attr.cap.max_send_wr = MAX_SEND_WR; + qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + + ctx->qp = ibv_create_qp(rdma_dev->pd, &qp_init_attr); + if (!ctx->qp) { + ERROR("Failed to create QP, {}", strerror(errno)); + return -1; + } + + // Modify QP to INIT state + if (modify_qp_to_init(ctx)) { + ERROR("Failed to modify QP to INIT, {}", strerror(errno)); + return -1; + } + + // save information to local_info for exchange data + ctx->local_info.qpn = ctx->qp->qp_num; + ctx->local_info.psn = lrand48() & 0xffffff; + if (rdma_dev->gid_index != -1) { + ctx->local_info.gid = gid; + } + + ctx->local_info.lid = rdma_dev->lid; + ctx->local_info.mtu = (uint32_t)rdma_dev->active_mtu; + return 0; +} + +int modify_qp_to_init(struct rdma_context *ctx) { + struct ibv_qp_attr attr = {}; + attr.qp_state = IBV_QPS_INIT; + attr.port_num = ctx->ib_port; + attr.pkey_index = 0; + attr.qp_access_flags = + IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; + + int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + + int ret = ibv_modify_qp(ctx->qp, &attr, flags); + if (ret) { + ERROR("Failed to modify QP to INIT"); + return ret; + } + return 0; +} + +int modify_qp_to_rts(struct rdma_context *ctx) { + struct ibv_qp_attr attr = {}; + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = ctx->local_info.psn; // Use 0 or match with local PSN + attr.max_rd_atomic = 16; + + int flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; + + int ret = ibv_modify_qp(ctx->qp, &attr, flags); + if (ret) { + ERROR("Failed to modify QP to RTS"); + return ret; + } + return 0; +} + +int modify_qp_to_rtr(struct rdma_context *ctx) { + struct ibv_qp_attr attr = {}; + attr.qp_state = IBV_QPS_RTR; + + // update MTU + if (ctx->remote_info.mtu != (uint32_t)ctx->active_mtu) { + INFO("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", + 1 << ((uint32_t)ctx->remote_info.mtu + 7), 1 << ((uint32_t)ctx->active_mtu + 7)); + } + + attr.path_mtu = + (enum ibv_mtu)std::min((uint32_t)ctx->active_mtu, (uint32_t)ctx->remote_info.mtu); + + attr.dest_qp_num = ctx->remote_info.qpn; + attr.rq_psn = ctx->remote_info.psn; + attr.max_dest_rd_atomic = 16; + attr.min_rnr_timer = 12; + attr.ah_attr.dlid = 0; + attr.ah_attr.sl = 0; + attr.ah_attr.src_path_bits = 0; + attr.ah_attr.port_num = ctx->ib_port; + + if (ctx->gid_index == -1) { + // IB + attr.ah_attr.dlid = ctx->remote_info.lid; + attr.ah_attr.is_global = 0; + } + else { + // RoCE v2 + attr.ah_attr.is_global = 1; + attr.ah_attr.grh.dgid = ctx->remote_info.gid; + attr.ah_attr.grh.sgid_index = ctx->gid_index; // local gid + attr.ah_attr.grh.hop_limit = 1; + } + + int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + + int ret = ibv_modify_qp(ctx->qp, &attr, flags); + if (ret) { + ERROR("Failed to modify QP to RTR"); + return ret; + } + return 0; +} diff --git a/src/rdma.h b/src/rdma.h new file mode 100644 index 0000000..aa9f4d2 --- /dev/null +++ b/src/rdma.h @@ -0,0 +1,45 @@ +#ifndef RDMA_H +#define RDMA_H +#include + +#include + +#include "ibv_helper.h" +#include "protocol.h" + +struct rdma_device { + struct ibv_context *ib_ctx; + struct ibv_pd *pd; + int ib_port; + int gid_index; + int lid; + std::string link_type; // IB or Ethernet + ibv_mtu active_mtu; +}; + +struct rdma_context { + struct rdma_device *rdma_dev; + + struct ibv_comp_channel *comp_channel; + struct ibv_cq *cq; + struct ibv_qp *qp; + + struct rdma_conn_info_t local_info; + struct rdma_conn_info_t remote_info; +}; + +// int init_rdma_context(std::string dev_name, int ib_port, std::string link_type, +// struct rdma_context *ctx); + +int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, + struct rdma_device *rdma_dev); +int init_rdma_context(std::string dev_name, int ib_port, std::string link_type, + struct rdma_device *rdma_dev, struct rdma_context *ctx); +int modify_qp_to_init(struct rdma_context *ctx); +int modify_qp_to_rts(struct rdma_context *ctx); +int modify_qp_to_rtr(struct rdma_context *ctx); + +int destroy_rdma_context(struct rdma_context *ctx); +int close_rdma_device(struct rdma_device *rdma_dev); + +#endif From 779843369a74a494b2aac7a99311427a777aa3a1 Mon Sep 17 00:00:00 2001 From: thesues Date: Fri, 2 May 2025 05:35:24 +0000 Subject: [PATCH 03/12] client ok --- src/libinfinistore.cpp | 14 +++++++++---- src/libinfinistore.h | 2 ++ src/rdma.cpp | 45 ++++++++++++++++++++++++------------------ src/rdma.h | 18 +++++++++-------- 4 files changed, 48 insertions(+), 31 deletions(-) diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index eaa5200..06b0c81 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -97,6 +97,7 @@ Connection::~Connection() { local_mr_.clear(); destroy_rdma_context(&ctx_); + close_rdma_device(&rdma_dev_); // if (recv_mr_) { // ibv_dereg_mr(recv_mr_); @@ -377,7 +378,12 @@ int Connection::setup_rdma(client_config_t config) { // return -1; // } - if (init_rdma_context(config.dev_name, config.ib_port, config.link_type, &ctx_) < 0) { + if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, &rdma_dev_) < 0) { + ERROR("Failed to open RDMA device"); + return -1; + } + + if (init_rdma_context(&ctx_, &rdma_dev_) < 0) { ERROR("Failed to initialize RDMA context"); return -1; } @@ -391,7 +397,7 @@ int Connection::setup_rdma(client_config_t config) { print_rdma_conn_info(&ctx_.local_info, false); // Modify QP to RTR state - if (modify_qp_to_rtr(&ctx_)) { + if (modify_qp_to_rtr(&ctx_, &rdma_dev_)) { ERROR("Failed to modify QP to RTR"); return -1; } @@ -406,7 +412,7 @@ int Connection::setup_rdma(client_config_t config) { because server also has the same number of buffers */ for (int i = 0; i < MAX_RECV_WR; i++) { - send_buffers_.push(new SendBuffer(ctx_.pd, PROTOCOL_BUFFER_SIZE)); + send_buffers_.push(new SendBuffer(rdma_dev_.pd, PROTOCOL_BUFFER_SIZE)); } stop_ = false; @@ -967,7 +973,7 @@ int Connection::register_mr(void *base_ptr, size_t ptr_region_size) { ibv_dereg_mr(local_mr_[(uintptr_t)base_ptr]); } struct ibv_mr *mr; - mr = ibv_reg_mr(ctx_.pd, base_ptr, ptr_region_size, + mr = ibv_reg_mr(rdma_dev_.pd, base_ptr, ptr_region_size, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr) { ERROR("Failed to register memory regions, size: {}", ptr_region_size); diff --git a/src/libinfinistore.h b/src/libinfinistore.h index aa42c6b..193282c 100644 --- a/src/libinfinistore.h +++ b/src/libinfinistore.h @@ -76,6 +76,8 @@ class Connection { // // local active_mtu attr, after exchanging with remote, we will use the min of the two for // // path.mtu // ibv_mtu active_mtu_; + + struct rdma_device rdma_dev_; struct rdma_context ctx_; // rdma_conn_info_t local_info_; diff --git a/src/rdma.cpp b/src/rdma.cpp index b63e672..9c0f016 100644 --- a/src/rdma.cpp +++ b/src/rdma.cpp @@ -5,6 +5,8 @@ #include "log.h" int close_rdma_device(struct rdma_device *rdma_dev) { + assert(rdma_dev != NULL); + if (rdma_dev->pd) { ibv_dealloc_pd(rdma_dev->pd); } @@ -15,8 +17,6 @@ int close_rdma_device(struct rdma_device *rdma_dev) { } int destroy_rdma_context(struct rdma_context *ctx) { - assert(ctx->rdma_dev != NULL, "destroy_rdma_context should be called before close_rdma_device"); - if (ctx->qp) { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(attr)); @@ -79,11 +79,13 @@ int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, ERROR("Unable to query port {} attributes\n", rdma_dev->ib_port); return -1; } + if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && link_type == "Ethernet") || (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && link_type == "IB")) { ERROR("port link layer and config link type don't match"); return -1; } + if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { rdma_dev->gid_index = -1; } @@ -91,7 +93,7 @@ int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, rdma_dev->gid_index = ibv_find_sgid_type(rdma_dev->ib_ctx, rdma_dev->ib_port, IBV_GID_TYPE_ROCE_V2, AF_INET); if (rdma_dev->gid_index < 0) { - ERROR("Failed to find GID"); + ERROR("Failed to find GID index"); return -1; } } @@ -99,10 +101,9 @@ int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, rdma_dev->lid = port_attr.lid; rdma_dev->active_mtu = port_attr.active_mtu; - union ibv_gid gid; // get gid if (rdma_dev->gid_index != -1 && - ibv_query_gid(rdma_dev->ib_ctx, 1, rdma_dev->gid_index, &gid)) { + ibv_query_gid(rdma_dev->ib_ctx, 1, rdma_dev->gid_index, &rdma_dev->gid)) { ERROR("Failed to get GID"); return -1; } @@ -112,15 +113,13 @@ int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, ERROR("Failed to allocate PD"); return -1; } + return 0; } int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { assert(ctx != NULL); assert(rdma_dev != NULL); - // work like a weak_ptr - ctx->rdma_dev = rdma_dev; - ctx->comp_channel = ibv_create_comp_channel(rdma_dev->ib_ctx); if (!ctx->comp_channel) { ERROR("Failed to create completion channel"); @@ -157,7 +156,7 @@ int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { } // Modify QP to INIT state - if (modify_qp_to_init(ctx)) { + if (modify_qp_to_init(ctx, rdma_dev)) { ERROR("Failed to modify QP to INIT, {}", strerror(errno)); return -1; } @@ -166,7 +165,7 @@ int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { ctx->local_info.qpn = ctx->qp->qp_num; ctx->local_info.psn = lrand48() & 0xffffff; if (rdma_dev->gid_index != -1) { - ctx->local_info.gid = gid; + ctx->local_info.gid = rdma_dev->gid; } ctx->local_info.lid = rdma_dev->lid; @@ -174,10 +173,13 @@ int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { return 0; } -int modify_qp_to_init(struct rdma_context *ctx) { +int modify_qp_to_init(struct rdma_context *ctx, struct rdma_device *rdma_dev) { + assert(ctx != NULL); + assert(rdma_dev != NULL); + struct ibv_qp_attr attr = {}; attr.qp_state = IBV_QPS_INIT; - attr.port_num = ctx->ib_port; + attr.port_num = rdma_dev->ib_port; attr.pkey_index = 0; attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; @@ -193,6 +195,8 @@ int modify_qp_to_init(struct rdma_context *ctx) { } int modify_qp_to_rts(struct rdma_context *ctx) { + assert(ctx != NULL); + struct ibv_qp_attr attr = {}; attr.qp_state = IBV_QPS_RTS; attr.timeout = 14; @@ -212,18 +216,21 @@ int modify_qp_to_rts(struct rdma_context *ctx) { return 0; } -int modify_qp_to_rtr(struct rdma_context *ctx) { +int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev) { + assert(ctx != NULL); + assert(rdma_dev != NULL); + struct ibv_qp_attr attr = {}; attr.qp_state = IBV_QPS_RTR; // update MTU - if (ctx->remote_info.mtu != (uint32_t)ctx->active_mtu) { + if (ctx->remote_info.mtu != (uint32_t)rdma_dev->active_mtu) { INFO("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", - 1 << ((uint32_t)ctx->remote_info.mtu + 7), 1 << ((uint32_t)ctx->active_mtu + 7)); + 1 << ((uint32_t)ctx->remote_info.mtu + 7), 1 << ((uint32_t)rdma_dev->active_mtu + 7)); } attr.path_mtu = - (enum ibv_mtu)std::min((uint32_t)ctx->active_mtu, (uint32_t)ctx->remote_info.mtu); + (enum ibv_mtu)std::min((uint32_t)rdma_dev->active_mtu, (uint32_t)ctx->remote_info.mtu); attr.dest_qp_num = ctx->remote_info.qpn; attr.rq_psn = ctx->remote_info.psn; @@ -232,9 +239,9 @@ int modify_qp_to_rtr(struct rdma_context *ctx) { attr.ah_attr.dlid = 0; attr.ah_attr.sl = 0; attr.ah_attr.src_path_bits = 0; - attr.ah_attr.port_num = ctx->ib_port; + attr.ah_attr.port_num = rdma_dev->ib_port; - if (ctx->gid_index == -1) { + if (rdma_dev->gid_index == -1) { // IB attr.ah_attr.dlid = ctx->remote_info.lid; attr.ah_attr.is_global = 0; @@ -243,7 +250,7 @@ int modify_qp_to_rtr(struct rdma_context *ctx) { // RoCE v2 attr.ah_attr.is_global = 1; attr.ah_attr.grh.dgid = ctx->remote_info.gid; - attr.ah_attr.grh.sgid_index = ctx->gid_index; // local gid + attr.ah_attr.grh.sgid_index = rdma_dev->gid_index; // local gid attr.ah_attr.grh.hop_limit = 1; } diff --git a/src/rdma.h b/src/rdma.h index aa9f4d2..5fb1647 100644 --- a/src/rdma.h +++ b/src/rdma.h @@ -12,13 +12,15 @@ struct rdma_device { struct ibv_pd *pd; int ib_port; int gid_index; + union ibv_gid gid; // RoCE v2 int lid; std::string link_type; // IB or Ethernet ibv_mtu active_mtu; + rdma_device() : ib_ctx(nullptr), pd(nullptr), ib_port(-1), gid_index(-1), lid(-1) {} }; struct rdma_context { - struct rdma_device *rdma_dev; + // struct rdma_device *rdma_dev; struct ibv_comp_channel *comp_channel; struct ibv_cq *cq; @@ -26,18 +28,18 @@ struct rdma_context { struct rdma_conn_info_t local_info; struct rdma_conn_info_t remote_info; + rdma_context() : comp_channel(nullptr), cq(nullptr), qp(nullptr) { + memset(&local_info, 0, sizeof(local_info)); + memset(&remote_info, 0, sizeof(remote_info)); + } }; -// int init_rdma_context(std::string dev_name, int ib_port, std::string link_type, -// struct rdma_context *ctx); - int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, struct rdma_device *rdma_dev); -int init_rdma_context(std::string dev_name, int ib_port, std::string link_type, - struct rdma_device *rdma_dev, struct rdma_context *ctx); -int modify_qp_to_init(struct rdma_context *ctx); +int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev); +int modify_qp_to_init(struct rdma_context *ctx, struct rdma_device *rdma_dev); int modify_qp_to_rts(struct rdma_context *ctx); -int modify_qp_to_rtr(struct rdma_context *ctx); +int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev); int destroy_rdma_context(struct rdma_context *ctx); int close_rdma_device(struct rdma_device *rdma_dev); From d0e5c6d003e7c9b2f1bd591b347a7952f08f35aa Mon Sep 17 00:00:00 2001 From: thesues Date: Fri, 2 May 2025 06:25:36 +0000 Subject: [PATCH 04/12] refactory, test pass --- src/infinistore.cpp | 345 +++++++++++++++++++++++------------------ src/infinistore.h | 16 +- src/libinfinistore.cpp | 12 +- src/libinfinistore.h | 4 +- src/rdma.cpp | 47 ++++-- src/rdma.h | 13 +- 6 files changed, 244 insertions(+), 193 deletions(-) diff --git a/src/infinistore.cpp b/src/infinistore.cpp index 02efe65..3f461dc 100644 --- a/src/infinistore.cpp +++ b/src/infinistore.cpp @@ -28,15 +28,17 @@ server_config_t global_config; uv_loop_t *loop; uv_tcp_t server; // global ibv context -struct ibv_context *ib_ctx; -struct ibv_pd *pd; +// struct ibv_context *ib_ctx; +// struct ibv_pd *pd; MM *mm; -int gidx = 0; -int lid = -1; -uint8_t ib_port = -1; -// local active_mtu attr, after exchanging with remote, we will use the min of the two for path.mtu -ibv_mtu active_mtu; +// int gidx = 0; +// int lid = -1; +// uint8_t ib_port = -1; +// // local active_mtu attr, after exchanging with remote, we will use the min of the two for +// path.mtu ibv_mtu active_mtu; + +struct rdma_device rdma_dev; // indicate if the MM extend is in flight bool extend_in_flight = false; @@ -84,13 +86,15 @@ struct Client { rdma_conn_info_t remote_info_; rdma_conn_info_t local_info_; - struct ibv_cq *cq_ = NULL; - struct ibv_qp *qp_ = NULL; + // struct ibv_cq *cq_ = NULL; + // struct ibv_qp *qp_ = NULL; bool rdma_connected_ = false; - struct ibv_comp_channel *comp_channel_ = NULL; + // struct ibv_comp_channel *comp_channel_ = NULL; + + rdma_context rdma_ctx_; - // notify thread new request - uv_sem_t sem_; + // // notify thread new request + // uv_sem_t sem_; uv_poll_t poll_handle_; @@ -165,27 +169,28 @@ Client::~Client() { tcp_recv_buffer_ = NULL; } - if (qp_) { - struct ibv_qp_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RESET; - if (ibv_modify_qp(qp_, &attr, IBV_QP_STATE)) { - ERROR("Failed to modify QP to ERR state"); - } - } - if (qp_) { - ibv_destroy_qp(qp_); - qp_ = NULL; - } - if (cq_) { - ibv_destroy_cq(cq_); - cq_ = NULL; - } - - if (comp_channel_) { - ibv_destroy_comp_channel(comp_channel_); - comp_channel_ = NULL; - } + destroy_rdma_context(&rdma_ctx_); + // if (qp_) { + // struct ibv_qp_attr attr; + // memset(&attr, 0, sizeof(attr)); + // attr.qp_state = IBV_QPS_RESET; + // if (ibv_modify_qp(qp_, &attr, IBV_QP_STATE)) { + // ERROR("Failed to modify QP to ERR state"); + // } + // } + // if (qp_) { + // ibv_destroy_qp(qp_); + // qp_ = NULL; + // } + // if (cq_) { + // ibv_destroy_cq(cq_); + // cq_ = NULL; + // } + + // if (comp_channel_) { + // ibv_destroy_comp_channel(comp_channel_); + // comp_channel_ = NULL; + // } } void on_close(uv_handle_t *handle) { @@ -337,7 +342,7 @@ void Client::post_ack(int return_code) { wr.sg_list = NULL; wr.num_sge = 0; wr.next = NULL; - int ret = ibv_post_send(qp_, &wr, &bad_wr); + int ret = ibv_post_send(rdma_ctx_.qp, &wr, &bad_wr); if (ret) { ERROR("Failed to send WITH_IMM message: {}", strerror(ret)); } @@ -352,7 +357,7 @@ void Client::cq_poll_handle(uv_poll_t *handle, int status, int events) { struct ibv_cq *cq; void *cq_context; - if (ibv_get_cq_event(comp_channel_, &cq, &cq_context) != 0) { + if (ibv_get_cq_event(rdma_ctx_.comp_channel, &cq, &cq_context) != 0) { ERROR("Failed to get CQ event"); return; } @@ -421,7 +426,7 @@ void Client::cq_poll_handle(uv_poll_t *handle, int status, int events) { struct ibv_sge *sges = item.second; ibv_send_wr *bad_wr = nullptr; DEBUG("IBV POST SEND, wr_id: {}", wrs[0].wr_id); - int ret = ibv_post_send(qp_, &wrs[0], &bad_wr); + int ret = ibv_post_send(rdma_ctx_.qp, &wrs[0], &bad_wr); if (ret) { ERROR("Failed to post RDMA write {}", strerror(ret)); throw std::runtime_error("Failed to post RDMA write"); @@ -464,7 +469,7 @@ void Client::cq_poll_handle(uv_poll_t *handle, int status, int events) { } } -void add_mempool(uv_work_t *req) { mm->add_mempool(pd); } +void add_mempool(uv_work_t *req) { mm->add_mempool(rdma_dev.pd); } void add_mempool_completion(uv_work_t *req, int status) { extend_in_flight = false; @@ -493,7 +498,7 @@ int Client::prepare_recv_rdma_request(int buf_idx) { rwr.next = NULL; rwr.sg_list = &sge; rwr.num_sge = 1; - if (ibv_post_recv(qp_, &rwr, &bad_wr)) { + if (ibv_post_recv(rdma_ctx_.qp, &rwr, &bad_wr)) { ERROR("Failed to post receive, {}"); return -1; } @@ -553,7 +558,7 @@ void Client::perform_batch_rdma(const RemoteMetaRequest *remote_meta_req, if (num_wr == max_wr || i == remote_meta_req->keys()->size() - 1) { if (!wr_full) { struct ibv_send_wr *bad_wr = nullptr; - int ret = ibv_post_send(qp_, &wrs[0], &bad_wr); + int ret = ibv_post_send(rdma_ctx_.qp, &wrs[0], &bad_wr); if (ret) { ERROR("Failed to post RDMA write {}", strerror(ret)); return; @@ -699,6 +704,7 @@ void on_write(uv_write_t *req, int status) { free(req); } +/* int init_rdma_context(server_config_t config) { struct ibv_device **dev_list; struct ibv_device *ib_dev; @@ -764,6 +770,7 @@ int init_rdma_context(server_config_t config) { return 0; } +*/ int Client::rdma_exchange() { INFO("do rdma exchange..."); @@ -775,128 +782,150 @@ int Client::rdma_exchange() { return SYSTEM_ERROR; } - comp_channel_ = ibv_create_comp_channel(ib_ctx); - if (!comp_channel_) { - ERROR("Failed to create completion channel"); - return -1; - } - - // RDMA setup if not already done - assert(comp_channel_ != NULL); - - cq_ = ibv_create_cq(ib_ctx, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); - if (!cq_) { - ERROR("Failed to create CQ"); - return SYSTEM_ERROR; - } - - // Create Queue Pair - struct ibv_qp_init_attr qp_init_attr = {}; - qp_init_attr.send_cq = cq_; - qp_init_attr.recv_cq = cq_; - qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection - qp_init_attr.cap.max_send_wr = MAX_SEND_WR; - qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; - qp_init_attr.cap.max_send_sge = 1; - qp_init_attr.cap.max_recv_sge = 1; - - qp_ = ibv_create_qp(pd, &qp_init_attr); - if (!qp_) { - ERROR("Failed to create QP"); - return SYSTEM_ERROR; - } - // Modify QP to INIT state - struct ibv_qp_attr attr = {}; - attr.qp_state = IBV_QPS_INIT; - attr.port_num = ib_port; - attr.pkey_index = 0; - attr.qp_access_flags = - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; - - int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; - - ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to INIT"); - return SYSTEM_ERROR; - } - - union ibv_gid gid; - // get gid - if (gidx != -1 && ibv_query_gid(ib_ctx, 1, gidx, &gid)) { - ERROR("Failed to get GID"); - return SYSTEM_ERROR; - } + // create rdma context per connection. + // use the global rdma_dev + init_rdma_context(&rdma_ctx_, &rdma_dev); - local_info_.qpn = qp_->qp_num; - local_info_.psn = lrand48() & 0xffffff; - local_info_.gid = gid; - local_info_.lid = lid; - local_info_.mtu = (uint32_t)active_mtu; + local_info_ = get_rdma_conn_info(&rdma_ctx_, &rdma_dev); - INFO("gid index: {}", gidx); print_rdma_conn_info(&local_info_, false); print_rdma_conn_info(&remote_info_, true); - // update MTU - if (remote_info_.mtu != (uint32_t)active_mtu) { - WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", - 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu + 7)); - } - // Modify QP to RTR state - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu, (uint32_t)remote_info_.mtu); - attr.dest_qp_num = remote_info_.qpn; - attr.rq_psn = remote_info_.psn; - attr.max_dest_rd_atomic = 16; - attr.min_rnr_timer = 12; - attr.ah_attr.dlid = 0; // RoCE v2 is used. - attr.ah_attr.sl = 0; - attr.ah_attr.src_path_bits = 0; - attr.ah_attr.port_num = ib_port; - - if (gidx == -1) { - // IB - attr.ah_attr.dlid = remote_info_.lid; - attr.ah_attr.is_global = 0; - } - else { - // RoCE v2 - attr.ah_attr.is_global = 1; - attr.ah_attr.grh.dgid = remote_info_.gid; - attr.ah_attr.grh.sgid_index = gidx; - attr.ah_attr.grh.hop_limit = 1; + if (modify_qp_to_rtr(&rdma_ctx_, &rdma_dev, &remote_info_)) { + ERROR("Failed to modify QP to RTR"); + return -1; } - flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; - - ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to RTR: reason: {}", strerror(ret)); - return SYSTEM_ERROR; + if (modify_qp_to_rts(&rdma_ctx_)) { + ERROR("Failed to modify QP to RTS"); + return -1; } - // Modify QP to RTS state - memset(&attr, 0, sizeof(attr)); - attr.qp_state = IBV_QPS_RTS; - attr.timeout = 14; - attr.retry_cnt = 7; - attr.rnr_retry = 7; - attr.sq_psn = local_info_.psn; - attr.max_rd_atomic = 16; + // comp_channel_ = ibv_create_comp_channel(ib_ctx); + // if (!comp_channel_) { + // ERROR("Failed to create completion channel"); + // return -1; + // } + + // // RDMA setup if not already done + // assert(comp_channel_ != NULL); + + // cq_ = ibv_create_cq(ib_ctx, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); + // if (!cq_) { + // ERROR("Failed to create CQ"); + // return SYSTEM_ERROR; + // } + + // // Create Queue Pair + // struct ibv_qp_init_attr qp_init_attr = {}; + // qp_init_attr.send_cq = cq_; + // qp_init_attr.recv_cq = cq_; + // qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection + // qp_init_attr.cap.max_send_wr = MAX_SEND_WR; + // qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; + // qp_init_attr.cap.max_send_sge = 1; + // qp_init_attr.cap.max_recv_sge = 1; + + // qp_ = ibv_create_qp(pd, &qp_init_attr); + // if (!qp_) { + // ERROR("Failed to create QP"); + // return SYSTEM_ERROR; + // } + // // Modify QP to INIT state + // struct ibv_qp_attr attr = {}; + // attr.qp_state = IBV_QPS_INIT; + // attr.port_num = ib_port; + // attr.pkey_index = 0; + // attr.qp_access_flags = + // IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; + + // int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; + + // ret = ibv_modify_qp(qp_, &attr, flags); + // if (ret) { + // ERROR("Failed to modify QP to INIT"); + // return SYSTEM_ERROR; + // } + + // union ibv_gid gid; + // // get gid + // if (gidx != -1 && ibv_query_gid(ib_ctx, 1, gidx, &gid)) { + // ERROR("Failed to get GID"); + // return SYSTEM_ERROR; + // } + + // local_info_.qpn = qp_->qp_num; + // local_info_.psn = lrand48() & 0xffffff; + // local_info_.gid = gid; + // local_info_.lid = lid; + // local_info_.mtu = (uint32_t)active_mtu; + + // INFO("gid index: {}", gidx); + // print_rdma_conn_info(&local_info_, false); + // print_rdma_conn_info(&remote_info_, true); + + // // update MTU + // if (remote_info_.mtu != (uint32_t)active_mtu) { + // WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", + // 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu + 7)); + // } + + // // Modify QP to RTR state + // memset(&attr, 0, sizeof(attr)); + // attr.qp_state = IBV_QPS_RTR; + // attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu, (uint32_t)remote_info_.mtu); + // attr.dest_qp_num = remote_info_.qpn; + // attr.rq_psn = remote_info_.psn; + // attr.max_dest_rd_atomic = 16; + // attr.min_rnr_timer = 12; + // attr.ah_attr.dlid = 0; // RoCE v2 is used. + // attr.ah_attr.sl = 0; + // attr.ah_attr.src_path_bits = 0; + // attr.ah_attr.port_num = ib_port; + + // if (gidx == -1) { + // // IB + // attr.ah_attr.dlid = remote_info_.lid; + // attr.ah_attr.is_global = 0; + // } + // else { + // // RoCE v2 + // attr.ah_attr.is_global = 1; + // attr.ah_attr.grh.dgid = remote_info_.gid; + // attr.ah_attr.grh.sgid_index = gidx; + // attr.ah_attr.grh.hop_limit = 1; + // } + + // flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | + // IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; + + // ret = ibv_modify_qp(qp_, &attr, flags); + // if (ret) { + // ERROR("Failed to modify QP to RTR: reason: {}", strerror(ret)); + // return SYSTEM_ERROR; + // } + + // // Modify QP to RTS state + // memset(&attr, 0, sizeof(attr)); + // attr.qp_state = IBV_QPS_RTS; + // attr.timeout = 14; + // attr.retry_cnt = 7; + // attr.rnr_retry = 7; + // attr.sq_psn = local_info_.psn; + // attr.max_rd_atomic = 16; + + // flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + // IBV_QP_MAX_QP_RD_ATOMIC; + + // ret = ibv_modify_qp(qp_, &attr, flags); + // if (ret) { + // ERROR("Failed to modify QP to RTS"); + // return SYSTEM_ERROR; + // } - flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC; - - ret = ibv_modify_qp(qp_, &attr, flags); - if (ret) { - ERROR("Failed to modify QP to RTS"); - return SYSTEM_ERROR; - } INFO("RDMA exchange done"); + rdma_connected_ = true; if (posix_memalign((void **)&send_buffer_, 4096, PROTOCOL_BUFFER_SIZE) != 0) { @@ -904,7 +933,7 @@ int Client::rdma_exchange() { return SYSTEM_ERROR; } - send_mr_ = ibv_reg_mr(pd, send_buffer_, PROTOCOL_BUFFER_SIZE, IBV_ACCESS_LOCAL_WRITE); + send_mr_ = ibv_reg_mr(rdma_dev.pd, send_buffer_, PROTOCOL_BUFFER_SIZE, IBV_ACCESS_LOCAL_WRITE); if (!send_mr_) { ERROR("Failed to register MR"); return SYSTEM_ERROR; @@ -916,7 +945,8 @@ int Client::rdma_exchange() { return SYSTEM_ERROR; } - recv_mr_[i] = ibv_reg_mr(pd, recv_buffer_[i], PROTOCOL_BUFFER_SIZE, IBV_ACCESS_LOCAL_WRITE); + recv_mr_[i] = + ibv_reg_mr(rdma_dev.pd, recv_buffer_[i], PROTOCOL_BUFFER_SIZE, IBV_ACCESS_LOCAL_WRITE); if (!recv_mr_[i]) { ERROR("Failed to register MR"); return SYSTEM_ERROR; @@ -928,12 +958,12 @@ int Client::rdma_exchange() { } } - if (ibv_req_notify_cq(cq_, 0)) { + if (ibv_req_notify_cq(rdma_ctx_.cq, 0)) { ERROR("Failed to request notify for CQ"); return SYSTEM_ERROR; } - uv_poll_init(loop, &poll_handle_, comp_channel_->fd); + uv_poll_init(loop, &poll_handle_, rdma_ctx_.comp_channel->fd); poll_handle_.data = this; uv_poll_start(&poll_handle_, UV_READABLE | UV_WRITABLE, [](uv_poll_t *handle, int status, int events) { @@ -1209,10 +1239,15 @@ int register_server(unsigned long loop_ptr, server_config_t config) { return -1; } - if (init_rdma_context(config) < 0) { + // if (init_rdma_context(config) < 0) { + // return -1; + // } + + if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, &rdma_dev) < 0) { + ERROR("Failed to open RDMA device"); return -1; } - mm = new MM(config.prealloc_size << 30, config.minimal_allocate_size << 10, pd); + mm = new MM(config.prealloc_size << 30, config.minimal_allocate_size << 10, rdma_dev.pd); INFO("register server done"); diff --git a/src/infinistore.h b/src/infinistore.h index c7fd476..f1453e8 100644 --- a/src/infinistore.h +++ b/src/infinistore.h @@ -7,21 +7,23 @@ #include "config.h" #include "log.h" #include "mempool.h" +#include "rdma.h" #include "utils.h" extern server_config_t global_config; extern uv_loop_t *loop; extern uv_tcp_t server; +extern struct rdma_device rdma_dev; // global ibv context -extern struct ibv_context *ib_ctx; -extern struct ibv_pd *pd; +// extern struct ibv_context *ib_ctx; +// extern struct ibv_pd *pd; extern MM *mm; -extern int gidx; -extern int lid; -extern uint8_t ib_port; -// local active_mtu attr, after exchanging with remote, we will use the min of the two for path.mtu -extern ibv_mtu active_mtu; +// extern int gidx; +// extern int lid; +// extern uint8_t ib_port; +// // local active_mtu attr, after exchanging with remote, we will use the min of the two for +// path.mtu extern ibv_mtu active_mtu; // indicate if the MM extend is in flight extern bool extend_in_flight; diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index 06b0c81..15566da 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -393,11 +393,11 @@ int Connection::setup_rdma(client_config_t config) { return -1; } - print_rdma_conn_info(&ctx_.remote_info, true); - print_rdma_conn_info(&ctx_.local_info, false); + print_rdma_conn_info(&remote_info_, true); + print_rdma_conn_info(&local_info_, false); // Modify QP to RTR state - if (modify_qp_to_rtr(&ctx_, &rdma_dev_)) { + if (modify_qp_to_rtr(&ctx_, &rdma_dev_, &remote_info_)) { ERROR("Failed to modify QP to RTR"); return -1; } @@ -525,9 +525,11 @@ int Connection::exchange_conn_info() { struct iovec iov[2]; struct msghdr msg; + local_info_ = get_rdma_conn_info(&ctx_, &rdma_dev_); + iov[0].iov_base = &header; iov[0].iov_len = FIXED_HEADER_SIZE; - iov[1].iov_base = &ctx_.local_info; + iov[1].iov_base = &local_info_; iov[1].iov_len = sizeof(rdma_conn_info_t); memset(&msg, 0, sizeof(msg)); @@ -550,7 +552,7 @@ int Connection::exchange_conn_info() { return -1; } - if (recv(sock_, &ctx_.remote_info, sizeof(rdma_conn_info_t), MSG_WAITALL) != + if (recv(sock_, &remote_info_, sizeof(rdma_conn_info_t), MSG_WAITALL) != sizeof(rdma_conn_info_t)) { ERROR("Failed to receive remote connection information"); return -1; diff --git a/src/libinfinistore.h b/src/libinfinistore.h index 193282c..c2fd612 100644 --- a/src/libinfinistore.h +++ b/src/libinfinistore.h @@ -80,8 +80,8 @@ class Connection { struct rdma_device rdma_dev_; struct rdma_context ctx_; - // rdma_conn_info_t local_info_; - // rdma_conn_info_t remote_info_; + rdma_conn_info_t local_info_; + rdma_conn_info_t remote_info_; std::unordered_map local_mr_; diff --git a/src/rdma.cpp b/src/rdma.cpp index 9c0f016..4e5cbb2 100644 --- a/src/rdma.cpp +++ b/src/rdma.cpp @@ -162,17 +162,31 @@ int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev) { } // save information to local_info for exchange data - ctx->local_info.qpn = ctx->qp->qp_num; - ctx->local_info.psn = lrand48() & 0xffffff; - if (rdma_dev->gid_index != -1) { - ctx->local_info.gid = rdma_dev->gid; - } + // ctx->local_info.qpn = ctx->qp->qp_num; + // ctx->local_info.psn = lrand48() & 0xffffff; + // if (rdma_dev->gid_index != -1) { + // ctx->local_info.gid = rdma_dev->gid; + // } + + // ctx->local_info.lid = rdma_dev->lid; + // ctx->local_info.mtu = (uint32_t)rdma_dev->active_mtu; + ctx->psn = lrand48() & 0xffffff; - ctx->local_info.lid = rdma_dev->lid; - ctx->local_info.mtu = (uint32_t)rdma_dev->active_mtu; return 0; } +rdma_conn_info_t get_rdma_conn_info(struct rdma_context *ctx, struct rdma_device *rdma_dev) { + assert(ctx != NULL); + rdma_conn_info_t conn_info = { + .qpn = ctx->qp->qp_num, + .psn = ctx->psn, + .gid = rdma_dev->gid, + .lid = rdma_dev->lid, + .mtu = (uint32_t)rdma_dev->active_mtu, + }; + return conn_info; +} + int modify_qp_to_init(struct rdma_context *ctx, struct rdma_device *rdma_dev) { assert(ctx != NULL); assert(rdma_dev != NULL); @@ -202,7 +216,7 @@ int modify_qp_to_rts(struct rdma_context *ctx) { attr.timeout = 14; attr.retry_cnt = 7; attr.rnr_retry = 7; - attr.sq_psn = ctx->local_info.psn; // Use 0 or match with local PSN + attr.sq_psn = ctx->psn; // Use 0 or match with local PSN attr.max_rd_atomic = 16; int flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | @@ -216,7 +230,8 @@ int modify_qp_to_rts(struct rdma_context *ctx) { return 0; } -int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev) { +int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev, + rdma_conn_info_t *remote_info) { assert(ctx != NULL); assert(rdma_dev != NULL); @@ -224,16 +239,16 @@ int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev) { attr.qp_state = IBV_QPS_RTR; // update MTU - if (ctx->remote_info.mtu != (uint32_t)rdma_dev->active_mtu) { + if (remote_info->mtu != (uint32_t)rdma_dev->active_mtu) { INFO("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", - 1 << ((uint32_t)ctx->remote_info.mtu + 7), 1 << ((uint32_t)rdma_dev->active_mtu + 7)); + 1 << ((uint32_t)remote_info->mtu + 7), 1 << ((uint32_t)rdma_dev->active_mtu + 7)); } attr.path_mtu = - (enum ibv_mtu)std::min((uint32_t)rdma_dev->active_mtu, (uint32_t)ctx->remote_info.mtu); + (enum ibv_mtu)std::min((uint32_t)rdma_dev->active_mtu, (uint32_t)remote_info->mtu); - attr.dest_qp_num = ctx->remote_info.qpn; - attr.rq_psn = ctx->remote_info.psn; + attr.dest_qp_num = remote_info->qpn; + attr.rq_psn = remote_info->psn; attr.max_dest_rd_atomic = 16; attr.min_rnr_timer = 12; attr.ah_attr.dlid = 0; @@ -243,13 +258,13 @@ int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev) { if (rdma_dev->gid_index == -1) { // IB - attr.ah_attr.dlid = ctx->remote_info.lid; + attr.ah_attr.dlid = remote_info->lid; attr.ah_attr.is_global = 0; } else { // RoCE v2 attr.ah_attr.is_global = 1; - attr.ah_attr.grh.dgid = ctx->remote_info.gid; + attr.ah_attr.grh.dgid = remote_info->gid; attr.ah_attr.grh.sgid_index = rdma_dev->gid_index; // local gid attr.ah_attr.grh.hop_limit = 1; } diff --git a/src/rdma.h b/src/rdma.h index 5fb1647..cea8495 100644 --- a/src/rdma.h +++ b/src/rdma.h @@ -25,21 +25,18 @@ struct rdma_context { struct ibv_comp_channel *comp_channel; struct ibv_cq *cq; struct ibv_qp *qp; - - struct rdma_conn_info_t local_info; - struct rdma_conn_info_t remote_info; - rdma_context() : comp_channel(nullptr), cq(nullptr), qp(nullptr) { - memset(&local_info, 0, sizeof(local_info)); - memset(&remote_info, 0, sizeof(remote_info)); - } + uint32_t psn; // local PSN, randomly generated when creating QP + rdma_context() : comp_channel(nullptr), cq(nullptr), qp(nullptr) {} }; int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, struct rdma_device *rdma_dev); +rdma_conn_info_t get_rdma_conn_info(struct rdma_context *ctx, struct rdma_device *rdma_dev); int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev); int modify_qp_to_init(struct rdma_context *ctx, struct rdma_device *rdma_dev); int modify_qp_to_rts(struct rdma_context *ctx); -int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev); +int modify_qp_to_rtr(struct rdma_context *ctx, struct rdma_device *rdma_dev, + rdma_conn_info_t *remote_info); int destroy_rdma_context(struct rdma_context *ctx); int close_rdma_device(struct rdma_device *rdma_dev); From 0343a0c1f46d18b7a62fd56f788988dc843c7ec0 Mon Sep 17 00:00:00 2001 From: thesues Date: Fri, 2 May 2025 22:27:45 +0000 Subject: [PATCH 05/12] add new parameter --hint-gid-index for server/client --- infinistore/example/client_async_single.py | 3 +- infinistore/example/demo_prefill.py | 102 --------------------- infinistore/lib.py | 7 +- infinistore/server.py | 20 ++-- src/config.h | 2 + src/infinistore.cpp | 6 +- src/libinfinistore.cpp | 3 +- src/pybind.cpp | 6 +- src/rdma.cpp | 42 ++++++--- src/rdma.h | 4 +- src/utils.cpp | 13 ++- src/utils.h | 1 + 12 files changed, 66 insertions(+), 143 deletions(-) delete mode 100644 infinistore/example/demo_prefill.py diff --git a/infinistore/example/client_async_single.py b/infinistore/example/client_async_single.py index 61c8796..955faf7 100644 --- a/infinistore/example/client_async_single.py +++ b/infinistore/example/client_async_single.py @@ -12,11 +12,12 @@ def generate_uuid(): config = infinistore.ClientConfig( host_addr="127.0.0.1", service_port=12345, - log_level="warning", + log_level="info", connection_type=infinistore.TYPE_RDMA, ib_port=1, link_type=infinistore.LINK_ETHERNET, dev_name="mlx5_0", + hint_gid_index=6, ) diff --git a/infinistore/example/demo_prefill.py b/infinistore/example/demo_prefill.py deleted file mode 100644 index 2f8d627..0000000 --- a/infinistore/example/demo_prefill.py +++ /dev/null @@ -1,102 +0,0 @@ -from infinistore import ( - ClientConfig, - InfinityConnection, -) -import infinistore -import torch -import time -import torch.nn as nn - -import queue -import threading - -N = 4096 -num_layers = 14 -num_heads = 8 -seq_length = 5000 - - -class TransformerLayer(nn.Module): - def __init__(self, embed_dim, num_heads): - super().__init__() - self.mha = nn.MultiheadAttention( - embed_dim, num_heads, device="cuda:0", dtype=torch.float16 - ) - self.norm = nn.LayerNorm(embed_dim, device="cuda:0", dtype=torch.float16) - self.ffn = nn.Sequential( - nn.Linear(embed_dim, embed_dim, device="cuda:0", dtype=torch.float16), - nn.ReLU(), - nn.Linear(embed_dim, embed_dim, device="cuda:0", dtype=torch.float16), - ) - - def forward(self, x): - attn_output, _ = self.mha(x, x, x) - x = self.norm(x + attn_output) - - ffn_output = self.ffn(x) - x = self.norm(x + ffn_output) - return x - - -def run(conn): - model = nn.Sequential( - *[TransformerLayer(N, num_heads) for _ in range(num_layers)] - ).cuda() - - input = torch.randn(seq_length, 1, N, device="cuda:0", dtype=torch.float16) - - torch.cuda.synchronize(0) - now1 = time.time() - - output = input - - upload_queue = queue.Queue() - - def upload_worker(): - while True: - layer_idx, event, data = upload_queue.get() - event.synchronize() - blocks = [(f"key{i}_{j}", j * 4096) for j in range(5000)] - conn.local_gpu_write_cache(output, blocks, 4096) - upload_queue.task_done() - - upload_thread = threading.Thread(target=upload_worker, daemon=True) - upload_thread.start() - events = [torch.cuda.Event() for _ in range(len(model))] - # outputs = [] - - for i, layer in enumerate(model): - output = layer(output) - events[i].record() - # old approach - # outputs.append(output) - upload_queue.put((i, events[i], output)) - - # Old approach: upload kvcache after computation - - # torch.cuda.synchronize() - # for i, output in enumerate(outputs): - # # split output into 1000 blocks - # blocks = [(f"key{i}_{j}", j*4096) for j in range(5000)] - # conn.local_gpu_write_cache(output, blocks, 4096) - - conn.sync() - - print("Time taken for linear layers: ", time.time() - now1) - - -if __name__ == "__main__": - config = ClientConfig( - host_addr="127.0.0.1", - service_port=12345, - log_level="debug", - connection_type=infinistore.TYPE_RDMA, - ib_port=1, - link_type=infinistore.LINK_ETHERNET, - dev_name="mlx5_0", - ) - - config.connection_type = infinistore.TYPE_LOCAL_GPU - local_conn = InfinityConnection(config) - local_conn.connect() - run(local_conn) diff --git a/infinistore/lib.py b/infinistore/lib.py index 24e2e79..39d6c02 100644 --- a/infinistore/lib.py +++ b/infinistore/lib.py @@ -62,6 +62,7 @@ def __init__(self, **kwargs): self.log_level = os.environ["INFINISTORE_LOG_LEVEL"] else: self.log_level = kwargs.get("log_level", "warning") + self.hint_gid_index = kwargs.get("hint_gid_index", -1) def __repr__(self): return ( @@ -104,6 +105,8 @@ class ServerConfig: prealloc_size (int): The preallocation size. Defaults to 16. minimal_allocate_size (int): The minimal allocation size. Defaults to 64. auto_increase (bool): indicate if infinistore will be automatically increased. 10GB each time. Default False. + hint_gid_index (int): The hint GID index. Defaults to -1. + """ def __init__(self, **kwargs): @@ -120,6 +123,7 @@ def __init__(self, **kwargs): self.evict_min_threshold = kwargs.get("evict_min_threshold", 0.6) self.evict_max_threshold = kwargs.get("evict_max_threshold", 0.8) self.evict_interval = kwargs.get("evict_interval", 5) + self.hint_gid_index = kwargs.get("hint_gid_index", -1) def __repr__(self): return ( @@ -128,7 +132,8 @@ def __repr__(self): f"dev_name='{self.dev_name}', ib_port={self.ib_port}, link_type='{self.link_type}', " f"prealloc_size={self.prealloc_size}, minimal_allocate_size={self.minimal_allocate_size}, " f"auto_increase={self.auto_increase}, evict_min_threshold={self.evict_min_threshold}, " - f"evict_max_threshold={self.evict_max_threshold}, evict_interval={self.evict_interval}" + f"evict_max_threshold={self.evict_max_threshold}, evict_interval={self.evict_interval}, " + f"hint_gid_index={self.hint_gid_index}" ) def verify(self): diff --git a/infinistore/server.py b/infinistore/server.py index e97d866..5c39bdd 100644 --- a/infinistore/server.py +++ b/infinistore/server.py @@ -137,6 +137,13 @@ def parse_args(): default=False, help="enable evict cache, default False", ) + parser.add_argument( + "--hint-gid-index", + required=False, + default=-1, + help="hint gid index, default 1, -1 means no hint", + type=int, + ) return parser.parse_args() @@ -156,18 +163,7 @@ async def periodic_evict(min_threshold: float, max_threshold: float, interval: i def main(): args = parse_args() config = ServerConfig( - manage_port=args.manage_port, - service_port=args.service_port, - log_level=args.log_level, - prealloc_size=args.prealloc_size, - dev_name=args.dev_name, - ib_port=args.ib_port, - link_type=args.link_type, - minimal_allocate_size=args.minimal_allocate_size, - auto_increase=args.auto_increase, - evict_interval=args.evict_interval, - evict_min_threshold=args.evict_min_threshold, - evict_max_threshold=args.evict_max_threshold, + **vars(args), ) config.verify() diff --git a/src/config.h b/src/config.h index dc7942c..5d34ae7 100644 --- a/src/config.h +++ b/src/config.h @@ -19,6 +19,7 @@ typedef struct ServerConfig { std::string link_type; int minimal_allocate_size; // unit: KB bool auto_increase; + int hint_gid_index; } server_config_t; typedef struct ClientConfig { @@ -28,6 +29,7 @@ typedef struct ClientConfig { std::string host_addr; int ib_port; std::string link_type; + int hint_gid_index; } client_config_t; #endif diff --git a/src/infinistore.cpp b/src/infinistore.cpp index 3f461dc..6e29bc5 100644 --- a/src/infinistore.cpp +++ b/src/infinistore.cpp @@ -1242,8 +1242,10 @@ int register_server(unsigned long loop_ptr, server_config_t config) { // if (init_rdma_context(config) < 0) { // return -1; // } - - if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, &rdma_dev) < 0) { + INFO("open rdma device {}, link_type {}, hint_gid_index {},", config.dev_name, config.link_type, + config.hint_gid_index); + if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, config.hint_gid_index, + &rdma_dev) < 0) { ERROR("Failed to open RDMA device"); return -1; } diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index 15566da..36f042e 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -378,7 +378,8 @@ int Connection::setup_rdma(client_config_t config) { // return -1; // } - if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, &rdma_dev_) < 0) { + if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, config.hint_gid_index, + &rdma_dev_) < 0) { ERROR("Failed to open RDMA device"); return -1; } diff --git a/src/pybind.cpp b/src/pybind.cpp index 5a9c627..205d7b3 100644 --- a/src/pybind.cpp +++ b/src/pybind.cpp @@ -42,7 +42,8 @@ PYBIND11_MODULE(_infinistore, m) { .def_readwrite("dev_name", &client_config_t::dev_name) .def_readwrite("ib_port", &client_config_t::ib_port) .def_readwrite("link_type", &client_config_t::link_type) - .def_readwrite("host_addr", &client_config_t::host_addr); + .def_readwrite("host_addr", &client_config_t::host_addr) + .def_readwrite("hint_gid_index", &client_config_t::hint_gid_index); py::class_>(m, "Connection") .def(py::init<>()) @@ -106,7 +107,8 @@ PYBIND11_MODULE(_infinistore, m) { .def_readwrite("link_type", &ServerConfig::link_type) .def_readwrite("prealloc_size", &ServerConfig::prealloc_size) .def_readwrite("minimal_allocate_size", &ServerConfig::minimal_allocate_size) - .def_readwrite("auto_increase", &ServerConfig::auto_increase); + .def_readwrite("auto_increase", &ServerConfig::auto_increase) + .def_readwrite("hint_gid_index", &ServerConfig::hint_gid_index); m.def( "purge_kv_map", []() { kv_map.clear(); }, "purge kv map"); m.def( diff --git a/src/rdma.cpp b/src/rdma.cpp index 4e5cbb2..b8dccf2 100644 --- a/src/rdma.cpp +++ b/src/rdma.cpp @@ -3,6 +3,7 @@ #include #include "log.h" +#include "utils.h" int close_rdma_device(struct rdma_device *rdma_dev) { assert(rdma_dev != NULL); @@ -35,7 +36,7 @@ int destroy_rdma_context(struct rdma_context *ctx) { return 0; } -int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, +int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, int hint_gid_index, struct rdma_device *rdma_dev) { assert(link_type == "IB" || link_type == "Ethernet"); assert(rdma_dev != NULL); @@ -87,27 +88,42 @@ int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, } if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { + // IB rdma_dev->gid_index = -1; + rdma_dev->lid = port_attr.lid; + INFO("IB lid {}", rdma_dev->lid); } else { - rdma_dev->gid_index = - ibv_find_sgid_type(rdma_dev->ib_ctx, rdma_dev->ib_port, IBV_GID_TYPE_ROCE_V2, AF_INET); - if (rdma_dev->gid_index < 0) { - ERROR("Failed to find GID index"); + // RoCE v2 + if (hint_gid_index >= 0) { + rdma_dev->gid_index = hint_gid_index; + WARN("RoCE choose user specified gid index {}", rdma_dev->gid_index); + } + else { + rdma_dev->gid_index = ibv_find_sgid_type(rdma_dev->ib_ctx, rdma_dev->ib_port, + IBV_GID_TYPE_ROCE_V2, AF_INET); + if (rdma_dev->gid_index < 0) { + ERROR("Failed to find GID index"); + return -1; + } + } + + if (ibv_query_gid(rdma_dev->ib_ctx, 1, rdma_dev->gid_index, &rdma_dev->gid) < 0) { + ERROR("Failed to get GID from index {}", rdma_dev->gid_index); return -1; } - } - rdma_dev->lid = port_attr.lid; - rdma_dev->active_mtu = port_attr.active_mtu; + // if gid all all zero, return error + if (rdma_dev->gid.global.subnet_prefix == 0 && rdma_dev->gid.global.interface_id == 0) { + ERROR("GID is all zero"); + return -1; + } - // get gid - if (rdma_dev->gid_index != -1 && - ibv_query_gid(rdma_dev->ib_ctx, 1, rdma_dev->gid_index, &rdma_dev->gid)) { - ERROR("Failed to get GID"); - return -1; + INFO("gid index {}, gid {}", rdma_dev->gid_index, human_readable_gid(rdma_dev->gid)); } + rdma_dev->active_mtu = port_attr.active_mtu; + rdma_dev->pd = ibv_alloc_pd(rdma_dev->ib_ctx); if (!rdma_dev->pd) { ERROR("Failed to allocate PD"); diff --git a/src/rdma.h b/src/rdma.h index cea8495..40d603f 100644 --- a/src/rdma.h +++ b/src/rdma.h @@ -13,7 +13,7 @@ struct rdma_device { int ib_port; int gid_index; union ibv_gid gid; // RoCE v2 - int lid; + uint16_t lid; std::string link_type; // IB or Ethernet ibv_mtu active_mtu; rdma_device() : ib_ctx(nullptr), pd(nullptr), ib_port(-1), gid_index(-1), lid(-1) {} @@ -29,7 +29,7 @@ struct rdma_context { rdma_context() : comp_channel(nullptr), cq(nullptr), qp(nullptr) {} }; -int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, +int open_rdma_device(std::string dev_name, int ib_port, std::string link_type, int hint_gid_index, struct rdma_device *rdma_dev); rdma_conn_info_t get_rdma_conn_info(struct rdma_context *ctx, struct rdma_device *rdma_dev); int init_rdma_context(struct rdma_context *ctx, struct rdma_device *rdma_dev); diff --git a/src/utils.cpp b/src/utils.cpp index 8a3d623..d435747 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -45,33 +45,32 @@ int recv_exact(int socket, void* buffer, size_t length) { return 0; // Successfully received exactly `length` bytes } -std::string human_readable_gid(rdma_conn_info_t* info) { +std::string human_readable_gid(union ibv_gid gid) { std::string gid_str; bool is_ipv4_mapped = true; // Check if the GID is an IPv4-mapped IPv6 address for (int i = 0; i < 10; ++i) { - if (info->gid.raw[i] != 0) { + if (gid.raw[i] != 0) { is_ipv4_mapped = false; break; } } - if (info->gid.raw[10] != 0xff || info->gid.raw[11] != 0xff) { + if (gid.raw[10] != 0xff || gid.raw[11] != 0xff) { is_ipv4_mapped = false; } if (is_ipv4_mapped) { // Convert the last 4 bytes to an IPv4 address char ipv4_str[INET_ADDRSTRLEN]; - uint8_t ipv4_addr[4] = {info->gid.raw[12], info->gid.raw[13], info->gid.raw[14], - info->gid.raw[15]}; + uint8_t ipv4_addr[4] = {gid.raw[12], gid.raw[13], gid.raw[14], gid.raw[15]}; inet_ntop(AF_INET, ipv4_addr, ipv4_str, INET_ADDRSTRLEN); gid_str = ipv4_str; } else { // Convert the GID to a standard IPv6 address string for (int i = 0; i < 16; ++i) { - gid_str += fmt::format("{:02x}", static_cast(info->gid.raw[i])); + gid_str += fmt::format("{:02x}", static_cast(gid.raw[i])); if (i % 2 == 1 && i != 15) { gid_str += ":"; } @@ -81,7 +80,7 @@ std::string human_readable_gid(rdma_conn_info_t* info) { } void print_rdma_conn_info(rdma_conn_info_t* info, bool is_remote) { - std::string gid_str = human_readable_gid(info); + std::string gid_str = human_readable_gid(info->gid); if (is_remote) { DEBUG("remote rdma_conn_info: psn: {}, qpn: {}, gid: {}, enum mtu: {}", (uint32_t)info->psn, (uint32_t)info->qpn, gid_str, (uint32_t)info->mtu); diff --git a/src/utils.h b/src/utils.h index 96467c0..42a1020 100644 --- a/src/utils.h +++ b/src/utils.h @@ -12,6 +12,7 @@ int send_exact(int socket, const void *buffer, size_t length); int recv_exact(int socket, void *buffer, size_t length); +std::string human_readable_gid(union ibv_gid gid); void print_rdma_conn_info(rdma_conn_info_t *info, bool is_remote); void signal_handler(int signum); From 170d64b591461e407253a49bc5a22666bba68672 Mon Sep 17 00:00:00 2001 From: thesues Date: Tue, 6 May 2025 00:40:21 +0000 Subject: [PATCH 06/12] client could resolve hostname --- infinistore/example/client_async_single.py | 1 - infinistore/lib.py | 25 +++++++++++++++++++++- src/libinfinistore.cpp | 3 ++- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/infinistore/example/client_async_single.py b/infinistore/example/client_async_single.py index 955faf7..8493b1a 100644 --- a/infinistore/example/client_async_single.py +++ b/infinistore/example/client_async_single.py @@ -17,7 +17,6 @@ def generate_uuid(): ib_port=1, link_type=infinistore.LINK_ETHERNET, dev_name="mlx5_0", - hint_gid_index=6, ) diff --git a/infinistore/lib.py b/infinistore/lib.py index 39d6c02..a471091 100644 --- a/infinistore/lib.py +++ b/infinistore/lib.py @@ -7,6 +7,7 @@ import asyncio from functools import singledispatchmethod from typing import Optional, Union, List, Tuple +import socket os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" @@ -322,6 +323,7 @@ async def connect_async(self): loop = asyncio.get_running_loop() def blocking_connect(): + self.config.host_addr = self.resolve_hostname(self.config.host_addr) if self.conn.init_connection(self.config) < 0: raise Exception("Failed to initialize remote connection") if self.config.connection_type == TYPE_RDMA: @@ -331,6 +333,25 @@ def blocking_connect(): await loop.run_in_executor(None, blocking_connect) + @staticmethod + def resolve_hostname(hostname: str) -> str: + try: + socket.inet_aton(hostname) + return hostname + except socket.error: + pass + + # If the hostname is not an IP address, resolve it + Logger.info(f"Resolving hostname: {hostname}") + try: + infos = socket.getaddrinfo( + hostname, None, socket.AF_INET, socket.SOCK_STREAM + ) + # Return the first resolved IPv4 address + return infos[0][4][0] + except socket.gaierror as e: + raise Exception(f"Failed to resolve hostname '{hostname}': {e}") + def connect(self): """ Establishes a connection to the Infinistore instance based on the configuration. @@ -343,7 +364,9 @@ def connect(self): if self.rdma_connected: raise Exception("Already connected to remote instance") - print(f"connecting to {self.config.host_addr}") + self.config.host_addr = self.resolve_hostname(self.config.host_addr) + + # check if the hostname is valid ret = self.conn.init_connection(self.config) if ret < 0: raise Exception("Failed to initialize remote connection") diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index 36f042e..ce287d8 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -441,10 +441,11 @@ int Connection::init_connection(client_config_t config) { // always connect to localhost if (inet_pton(AF_INET, config.host_addr.data(), &serv_addr.sin_addr) <= 0) { - ERROR("Invalid address/ Address not supported"); + ERROR("Invalid address/ Address not supported {}", config.host_addr); return -1; } + INFO("Connecting to {}:{}", config.host_addr, config.service_port); if (connect(sock_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { ERROR("Failed to connect to server"); return -1; From ef9128c2e9ba7550668d1c047e384fea3b23b06a Mon Sep 17 00:00:00 2001 From: thesues Date: Tue, 6 May 2025 18:13:46 +0000 Subject: [PATCH 07/12] remove rdma deprecated code --- src/infinistore.cpp | 193 -------------------------------- src/infinistore.h | 10 +- src/libinfinistore.cpp | 245 ----------------------------------------- src/libinfinistore.h | 17 --- 4 files changed, 1 insertion(+), 464 deletions(-) diff --git a/src/infinistore.cpp b/src/infinistore.cpp index 6e29bc5..1ef22db 100644 --- a/src/infinistore.cpp +++ b/src/infinistore.cpp @@ -704,74 +704,6 @@ void on_write(uv_write_t *req, int status) { free(req); } -/* -int init_rdma_context(server_config_t config) { - struct ibv_device **dev_list; - struct ibv_device *ib_dev; - int num_devices; - dev_list = ibv_get_device_list(&num_devices); - if (!dev_list) { - ERROR("Failed to get RDMA devices list"); - return -1; - } - - for (int i = 0; i < num_devices; ++i) { - char *dev_name_from_list = (char *)ibv_get_device_name(dev_list[i]); - if (strcmp(dev_name_from_list, config.dev_name.c_str()) == 0) { - INFO("found device {}", dev_name_from_list); - ib_dev = dev_list[i]; - ib_ctx = ibv_open_device(ib_dev); - break; - } - } - - if (!ib_ctx) { - INFO( - "Can't find or failed to open the specified device, try to open " - "the default device {}", - (char *)ibv_get_device_name(dev_list[0])); - ib_ctx = ibv_open_device(dev_list[0]); - if (!ib_ctx) { - ERROR("Failed to open the default device"); - return -1; - } - } - - struct ibv_port_attr port_attr; - ib_port = config.ib_port; - if (ibv_query_port(ib_ctx, ib_port, &port_attr)) { - ERROR("Unable to query port {} attributes\n", ib_port); - return -1; - } - if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && config.link_type == "Ethernet") || - (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && config.link_type == "IB")) { - ERROR("port link layer and config link type don't match"); - return -1; - } - if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { - gidx = -1; - } - else { - gidx = ibv_find_sgid_type(ib_ctx, ib_port, IBV_GID_TYPE_ROCE_V2, AF_INET); - if (gidx < 0) { - ERROR("Failed to find GID"); - return -1; - } - } - - lid = port_attr.lid; - active_mtu = port_attr.active_mtu; - - pd = ibv_alloc_pd(ib_ctx); - if (!pd) { - ERROR("Failed to allocate PD"); - return -1; - } - - return 0; -} -*/ - int Client::rdma_exchange() { INFO("do rdma exchange..."); @@ -802,128 +734,6 @@ int Client::rdma_exchange() { return -1; } - // comp_channel_ = ibv_create_comp_channel(ib_ctx); - // if (!comp_channel_) { - // ERROR("Failed to create completion channel"); - // return -1; - // } - - // // RDMA setup if not already done - // assert(comp_channel_ != NULL); - - // cq_ = ibv_create_cq(ib_ctx, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); - // if (!cq_) { - // ERROR("Failed to create CQ"); - // return SYSTEM_ERROR; - // } - - // // Create Queue Pair - // struct ibv_qp_init_attr qp_init_attr = {}; - // qp_init_attr.send_cq = cq_; - // qp_init_attr.recv_cq = cq_; - // qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection - // qp_init_attr.cap.max_send_wr = MAX_SEND_WR; - // qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; - // qp_init_attr.cap.max_send_sge = 1; - // qp_init_attr.cap.max_recv_sge = 1; - - // qp_ = ibv_create_qp(pd, &qp_init_attr); - // if (!qp_) { - // ERROR("Failed to create QP"); - // return SYSTEM_ERROR; - // } - // // Modify QP to INIT state - // struct ibv_qp_attr attr = {}; - // attr.qp_state = IBV_QPS_INIT; - // attr.port_num = ib_port; - // attr.pkey_index = 0; - // attr.qp_access_flags = - // IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; - - // int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; - - // ret = ibv_modify_qp(qp_, &attr, flags); - // if (ret) { - // ERROR("Failed to modify QP to INIT"); - // return SYSTEM_ERROR; - // } - - // union ibv_gid gid; - // // get gid - // if (gidx != -1 && ibv_query_gid(ib_ctx, 1, gidx, &gid)) { - // ERROR("Failed to get GID"); - // return SYSTEM_ERROR; - // } - - // local_info_.qpn = qp_->qp_num; - // local_info_.psn = lrand48() & 0xffffff; - // local_info_.gid = gid; - // local_info_.lid = lid; - // local_info_.mtu = (uint32_t)active_mtu; - - // INFO("gid index: {}", gidx); - // print_rdma_conn_info(&local_info_, false); - // print_rdma_conn_info(&remote_info_, true); - - // // update MTU - // if (remote_info_.mtu != (uint32_t)active_mtu) { - // WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", - // 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu + 7)); - // } - - // // Modify QP to RTR state - // memset(&attr, 0, sizeof(attr)); - // attr.qp_state = IBV_QPS_RTR; - // attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu, (uint32_t)remote_info_.mtu); - // attr.dest_qp_num = remote_info_.qpn; - // attr.rq_psn = remote_info_.psn; - // attr.max_dest_rd_atomic = 16; - // attr.min_rnr_timer = 12; - // attr.ah_attr.dlid = 0; // RoCE v2 is used. - // attr.ah_attr.sl = 0; - // attr.ah_attr.src_path_bits = 0; - // attr.ah_attr.port_num = ib_port; - - // if (gidx == -1) { - // // IB - // attr.ah_attr.dlid = remote_info_.lid; - // attr.ah_attr.is_global = 0; - // } - // else { - // // RoCE v2 - // attr.ah_attr.is_global = 1; - // attr.ah_attr.grh.dgid = remote_info_.gid; - // attr.ah_attr.grh.sgid_index = gidx; - // attr.ah_attr.grh.hop_limit = 1; - // } - - // flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | - // IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; - - // ret = ibv_modify_qp(qp_, &attr, flags); - // if (ret) { - // ERROR("Failed to modify QP to RTR: reason: {}", strerror(ret)); - // return SYSTEM_ERROR; - // } - - // // Modify QP to RTS state - // memset(&attr, 0, sizeof(attr)); - // attr.qp_state = IBV_QPS_RTS; - // attr.timeout = 14; - // attr.retry_cnt = 7; - // attr.rnr_retry = 7; - // attr.sq_psn = local_info_.psn; - // attr.max_rd_atomic = 16; - - // flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | - // IBV_QP_MAX_QP_RD_ATOMIC; - - // ret = ibv_modify_qp(qp_, &attr, flags); - // if (ret) { - // ERROR("Failed to modify QP to RTS"); - // return SYSTEM_ERROR; - // } - INFO("RDMA exchange done"); rdma_connected_ = true; @@ -1239,9 +1049,6 @@ int register_server(unsigned long loop_ptr, server_config_t config) { return -1; } - // if (init_rdma_context(config) < 0) { - // return -1; - // } INFO("open rdma device {}, link_type {}, hint_gid_index {},", config.dev_name, config.link_type, config.hint_gid_index); if (open_rdma_device(config.dev_name, config.ib_port, config.link_type, config.hint_gid_index, diff --git a/src/infinistore.h b/src/infinistore.h index f1453e8..4779625 100644 --- a/src/infinistore.h +++ b/src/infinistore.h @@ -14,16 +14,8 @@ extern server_config_t global_config; extern uv_loop_t *loop; extern uv_tcp_t server; extern struct rdma_device rdma_dev; -// global ibv context -// extern struct ibv_context *ib_ctx; -// extern struct ibv_pd *pd; -extern MM *mm; -// extern int gidx; -// extern int lid; -// extern uint8_t ib_port; -// // local active_mtu attr, after exchanging with remote, we will use the min of the two for -// path.mtu extern ibv_mtu active_mtu; +extern MM *mm; // indicate if the MM extend is in flight extern bool extend_in_flight; diff --git a/src/libinfinistore.cpp b/src/libinfinistore.cpp index ce287d8..3f5f79d 100644 --- a/src/libinfinistore.cpp +++ b/src/libinfinistore.cpp @@ -98,189 +98,8 @@ Connection::~Connection() { destroy_rdma_context(&ctx_); close_rdma_device(&rdma_dev_); - - // if (recv_mr_) { - // ibv_dereg_mr(recv_mr_); - // } - - // if (recv_buffer_) { - // free(recv_buffer_); - // } - - // if (qp_) { - // struct ibv_qp_attr attr; - // memset(&attr, 0, sizeof(attr)); - // attr.qp_state = IBV_QPS_RESET; - // ibv_modify_qp(qp_, &attr, IBV_QP_STATE); - // } - // if (qp_) { - // ibv_destroy_qp(qp_); - // } - // if (cq_) { - // ibv_destroy_cq(cq_); - // } - - // if (comp_channel_) { - // ibv_destroy_comp_channel(comp_channel_); - // } - // if (pd_) { - // ibv_dealloc_pd(pd_); - // } - // if (ib_ctx_) { - // ibv_close_device(ib_ctx_); - // } } -// int Connection::init_rdma_resources(client_config_t config) { -// // Get list of RDMA devices -// struct ibv_device **dev_list; -// struct ibv_device *ib_dev; -// int num_devices; - -// dev_list = ibv_get_device_list(&num_devices); -// if (!dev_list) { -// ERROR("Failed to get RDMA devices list"); -// return -1; -// } - -// for (int i = 0; i < num_devices; ++i) { -// char *dev_name_from_list = (char *)ibv_get_device_name(dev_list[i]); -// if (strcmp(dev_name_from_list, config.dev_name.c_str()) == 0) { -// INFO("found device {}", dev_name_from_list); -// ib_dev = dev_list[i]; -// ib_ctx_ = ibv_open_device(ib_dev); -// break; -// } -// } - -// if (!ib_ctx_) { -// INFO( -// "Can't find or failed to open the specified device, try to open " -// "the default device {}", -// (char *)ibv_get_device_name(dev_list[0])); -// ib_ctx_ = ibv_open_device(dev_list[0]); -// if (!ib_ctx_) { -// ERROR("Failed to open the default device"); -// return -1; -// } -// } -// ibv_free_device_list(dev_list); - -// struct ibv_port_attr port_attr; -// ib_port_ = config.ib_port; -// if (ibv_query_port(ib_ctx_, ib_port_, &port_attr)) { -// ERROR("Unable to query port {} attributes\n", ib_port_); -// return -1; -// } - -// int gidx = 0; -// if ((port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND && config.link_type == "Ethernet") || -// (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET && config.link_type == "IB")) { -// ERROR("port link layer and config link type don't match"); -// return -1; -// } -// if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { -// gidx = -1; -// } -// else { -// gidx = ibv_find_sgid_type(ib_ctx_, ib_port_, IBV_GID_TYPE_ROCE_V2, AF_INET); -// if (gidx < 0) { -// ERROR("Failed to find GID"); -// return -1; -// } -// } - -// lid_ = port_attr.lid; -// gidx_ = gidx; - -// active_mtu_ = port_attr.active_mtu; - -// union ibv_gid gid; -// // get gid -// if (gidx_ != -1 && ibv_query_gid(ib_ctx_, 1, gidx_, &gid)) { -// ERROR("Failed to get GID"); -// return -1; -// } - -// // Allocate Protection Domain -// pd_ = ibv_alloc_pd(ib_ctx_); -// if (!pd_) { -// ERROR("Failed to allocate PD"); -// return -1; -// } - -// comp_channel_ = ibv_create_comp_channel(ib_ctx_); -// if (!comp_channel_) { -// ERROR("Failed to create completion channel"); -// return -1; -// } - -// // Create Completion Queue -// cq_ = ibv_create_cq(ib_ctx_, MAX_SEND_WR + MAX_RECV_WR, NULL, comp_channel_, 0); -// if (!cq_) { -// ERROR("Failed to create CQ"); -// return -1; -// } - -// if (ibv_req_notify_cq(cq_, 0)) { -// ERROR("Failed to request CQ notification"); -// return -1; -// } - -// // Create Queue Pair -// struct ibv_qp_init_attr qp_init_attr = {}; -// qp_init_attr.send_cq = cq_; -// qp_init_attr.recv_cq = cq_; -// qp_init_attr.qp_type = IBV_QPT_RC; // Reliable Connection -// qp_init_attr.cap.max_send_wr = MAX_SEND_WR; -// qp_init_attr.cap.max_recv_wr = MAX_RECV_WR; -// qp_init_attr.cap.max_send_sge = 1; -// qp_init_attr.cap.max_recv_sge = 1; - -// qp_ = ibv_create_qp(pd_, &qp_init_attr); -// if (!qp_) { -// ERROR("Failed to create QP, {}", strerror(errno)); -// return -1; -// } - -// // Modify QP to INIT state -// if (modify_qp_to_init()) { -// ERROR("Failed to modify QP to INIT, {}", strerror(errno)); -// return -1; -// } - -// local_info_.qpn = qp_->qp_num; -// local_info_.psn = lrand48() & 0xffffff; -// if (gidx != -1) { -// local_info_.gid = gid; -// DEBUG("gid index: {}", gidx); -// } -// local_info_.lid = lid_; - -// local_info_.mtu = (uint32_t)active_mtu_; - -// print_rdma_conn_info(&local_info_, false); -// return 0; -// } - -// int Connection::modify_qp_to_init() { -// struct ibv_qp_attr attr = {}; -// attr.qp_state = IBV_QPS_INIT; -// attr.port_num = ib_port_; -// attr.pkey_index = 0; -// attr.qp_access_flags = -// IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE; - -// int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS; - -// int ret = ibv_modify_qp(qp_, &attr, flags); -// if (ret) { -// ERROR("Failed to modify QP to INIT"); -// return ret; -// } -// return 0; -// } - void Connection::cq_handler() { assert(ctx_.comp_channel != NULL); @@ -453,70 +272,6 @@ int Connection::init_connection(client_config_t config) { return 0; } -// int Connection::modify_qp_to_rtr() { -// struct ibv_qp_attr attr = {}; -// attr.qp_state = IBV_QPS_RTR; - -// // update MTU -// if (remote_info_.mtu != active_mtu_) { -// WARN("remote MTU: {}, local MTU: {} is not the same, update to minimal MTU", -// 1 << ((uint32_t)remote_info_.mtu + 7), 1 << ((uint32_t)active_mtu_ + 7)); -// } -// attr.path_mtu = (enum ibv_mtu)std::min((uint32_t)active_mtu_, (uint32_t)remote_info_.mtu); - -// attr.dest_qp_num = remote_info_.qpn; -// attr.rq_psn = remote_info_.psn; -// attr.max_dest_rd_atomic = 16; -// attr.min_rnr_timer = 12; -// attr.ah_attr.dlid = 0; -// attr.ah_attr.sl = 0; -// attr.ah_attr.src_path_bits = 0; -// attr.ah_attr.port_num = ib_port_; - -// if (gidx_ == -1) { -// // IB -// attr.ah_attr.dlid = remote_info_.lid; -// attr.ah_attr.is_global = 0; -// } -// else { -// // RoCE v2 -// attr.ah_attr.is_global = 1; -// attr.ah_attr.grh.dgid = remote_info_.gid; -// attr.ah_attr.grh.sgid_index = gidx_; // local gid -// attr.ah_attr.grh.hop_limit = 1; -// } - -// int flags = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | -// IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER; - -// int ret = ibv_modify_qp(qp_, &attr, flags); -// if (ret) { -// ERROR("Failed to modify QP to RTR"); -// return ret; -// } -// return 0; -// } - -// int Connection::modify_qp_to_rts() { -// struct ibv_qp_attr attr = {}; -// attr.qp_state = IBV_QPS_RTS; -// attr.timeout = 14; -// attr.retry_cnt = 7; -// attr.rnr_retry = 7; -// attr.sq_psn = local_info_.psn; // Use 0 or match with local PSN -// attr.max_rd_atomic = 16; - -// int flags = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | -// IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; - -// int ret = ibv_modify_qp(qp_, &attr, flags); -// if (ret) { -// ERROR("Failed to modify QP to RTS"); -// return ret; -// } -// return 0; -// } - int Connection::exchange_conn_info() { header_t header = { .magic = MAGIC, diff --git a/src/libinfinistore.h b/src/libinfinistore.h index c2fd612..e263a3f 100644 --- a/src/libinfinistore.h +++ b/src/libinfinistore.h @@ -64,19 +64,6 @@ class Connection { // tcp socket int sock_ = 0; - // rdma connections - // struct ibv_context *ib_ctx_ = NULL; - // struct ibv_pd *pd_ = NULL; - // struct ibv_cq *cq_ = NULL; - // struct ibv_qp *qp_ = NULL; - // int gidx_ = -1; - // int lid_ = -1; - // uint8_t ib_port_ = -1; - - // // local active_mtu attr, after exchanging with remote, we will use the min of the two for - // // path.mtu - // ibv_mtu active_mtu_; - struct rdma_device rdma_dev_; struct rdma_context ctx_; @@ -118,11 +105,7 @@ class Connection { int delete_keys(const std::vector &keys); int register_mr(void *base_ptr, size_t ptr_region_size); - // int modify_qp_to_init(); - // int modify_qp_to_rts(); - // int modify_qp_to_rtr(); int exchange_conn_info(); - // int init_rdma_resources(client_config_t config); void post_recv_ack(rdma_info_base *info); From 1e68f33f2b6f78e5cd9beec6497c9e7ce07c50a0 Mon Sep 17 00:00:00 2001 From: thesues Date: Fri, 9 May 2025 17:40:49 +0000 Subject: [PATCH 08/12] fix --- src/infinistore.cpp | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/src/infinistore.cpp b/src/infinistore.cpp index 1ef22db..d5cf5a3 100644 --- a/src/infinistore.cpp +++ b/src/infinistore.cpp @@ -27,16 +27,8 @@ server_config_t global_config; uv_loop_t *loop; uv_tcp_t server; -// global ibv context -// struct ibv_context *ib_ctx; -// struct ibv_pd *pd; -MM *mm; -// int gidx = 0; -// int lid = -1; -// uint8_t ib_port = -1; -// // local active_mtu attr, after exchanging with remote, we will use the min of the two for -// path.mtu ibv_mtu active_mtu; +MM *mm; struct rdma_device rdma_dev; @@ -86,16 +78,10 @@ struct Client { rdma_conn_info_t remote_info_; rdma_conn_info_t local_info_; - // struct ibv_cq *cq_ = NULL; - // struct ibv_qp *qp_ = NULL; bool rdma_connected_ = false; - // struct ibv_comp_channel *comp_channel_ = NULL; rdma_context rdma_ctx_; - // // notify thread new request - // uv_sem_t sem_; - uv_poll_t poll_handle_; Client() = default; @@ -170,27 +156,6 @@ Client::~Client() { } destroy_rdma_context(&rdma_ctx_); - // if (qp_) { - // struct ibv_qp_attr attr; - // memset(&attr, 0, sizeof(attr)); - // attr.qp_state = IBV_QPS_RESET; - // if (ibv_modify_qp(qp_, &attr, IBV_QP_STATE)) { - // ERROR("Failed to modify QP to ERR state"); - // } - // } - // if (qp_) { - // ibv_destroy_qp(qp_); - // qp_ = NULL; - // } - // if (cq_) { - // ibv_destroy_cq(cq_); - // cq_ = NULL; - // } - - // if (comp_channel_) { - // ibv_destroy_comp_channel(comp_channel_); - // comp_channel_ = NULL; - // } } void on_close(uv_handle_t *handle) { From bd5b20d1c60cd2a7bb845073c145990704088186 Mon Sep 17 00:00:00 2001 From: thesues Date: Sat, 10 May 2025 08:36:50 +0000 Subject: [PATCH 09/12] GOOD --- build_manylinux_wheels.sh | 23 ++++++++----------- setup.py | 48 +++++++++++++++++++++++---------------- src/Makefile | 44 ----------------------------------- 3 files changed, 38 insertions(+), 77 deletions(-) delete mode 100644 src/Makefile diff --git a/build_manylinux_wheels.sh b/build_manylinux_wheels.sh index 94511a1..49200dd 100644 --- a/build_manylinux_wheels.sh +++ b/build_manylinux_wheels.sh @@ -1,20 +1,15 @@ PYTHON_VERSIONS=( - "/opt/python/cp310-cp310/bin/python3.10" - "/opt/python/cp311-cp311/bin/python3.11" - "/opt/python/cp312-cp312/bin/python3.12" + "/opt/python/cp310-cp310/bin/python3.10" + "/opt/python/cp311-cp311/bin/python3.11" + "/opt/python/cp312-cp312/bin/python3.12" ) -rm -rf build/ dist/ wheelhouse/ + +rm -rf dist/ wheelhouse/ +OLDPATH=$PATH for PYTHON in "${PYTHON_VERSIONS[@]}"; do - make -C src clean - rm -rf infinistore/*.so - make -C src manylinux PYTHON=${PYTHON} -j8 - if [ $? -ne 0 ]; then - exit 1 - fi - unset LD_LIBRARY_PATH - export LD_LIBRARY_PATH=/usr/local/lib - #ldd check, auditwheel will also check LD_LIBRARY_PATH - ldd src/*.so + rm -rf build/ + BINDIR="$(dirname $PYTHON)" + export PATH="$BINDIR:$OLDPATH" ${PYTHON} setup.py bdist_wheel #runtime will install ibverbs, so exclude it WHEEL_FILE=$(ls dist/*.whl) diff --git a/setup.py b/setup.py index 589d2ba..f2b0617 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ import subprocess from setuptools import setup, find_packages, Extension from setuptools.command.build_ext import build_ext -import sys def get_version(): @@ -30,26 +29,29 @@ def get_version(): # invoke the make command to build the shared library class CustomBuildExt(build_ext): def run(self): + import glob + import shutil + import os + + subprocess.check_call(["meson", "setup", "build", "--wipe"], cwd="src") + subprocess.check_call(["ninja"], cwd="src/build") + + so_files = glob.glob("src/build/_infinistore*.so") if self.inplace: - # developer mode - print("developer mode: building shared library") - subprocess.check_call(["make", "clean"], cwd="src") - subprocess.check_call(["make"], cwd="src") - super().run() + for so_file in so_files: + dest = os.path.join("infinistore", os.path.basename(so_file)) + print(f"Copying {so_file} to {dest}") + shutil.copy(so_file, dest) else: - # package mode, return. build.sh script will build the shared library - return - + build_dir = os.path.join(self.build_lib, "infinistore") + for so_file in so_files: + build_dest = os.path.join(build_dir, os.path.basename(so_file)) + print(f"Copying {so_file} to build directory: {build_dest}") + shutil.copy(so_file, build_dest) -ext_modules = [] -if "bdist_wheel" in sys.argv: - # this dummy extension is only for the wheel package - # so wheel package will have Python ABI dependency for wheel package. - # this is to prevent from strange error when do 'pip install -e .' - # fix this error if you have better solution - cpp_extension = Extension(name="infinistore.dummy", sources=[]) - ext_modules = [cpp_extension] +cpp_extension = Extension(name="infinistore._infinistore", sources=[""]) +ext_modules = [cpp_extension] setup( name="infinistore", @@ -57,9 +59,17 @@ def run(self): packages=find_packages(), cmdclass={"build_ext": CustomBuildExt}, package_data={ - "infinistore": ["*.so"], + "infinistore": ["_infinistore*.so"], }, - install_requires=["torch", "uvloop", "fastapi", "pybind11", "uvicorn", "numpy"], + install_requires=[ + "uvloop", + "fastapi", + "pybind11", + "uvicorn", + "numpy", + "meson", + "ninja", + ], description="A kvcache memory pool", long_description=open("README.md").read(), long_description_content_type="text/markdown", diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index bb22870..0000000 --- a/src/Makefile +++ /dev/null @@ -1,44 +0,0 @@ -CXX = g++ -CXXFLAGS = -std=c++17 -Wall -O3 -g - -INCLUDES = -I/usr/local/ -LDFLAGS = -rdynamic -LIBS = -luv -libverbs -lfmt -lboost_stacktrace_basic -ldl -PYTHON=python3 -PYBIND11_INCLUDES = $(shell $(PYTHON) -m pybind11 --includes) -PYTHON_EXTENSION_SUFFIX = $(shell $(PYTHON)-config --extension-suffix) - -PYBIND_TARGET= _infinistore$(PYTHON_EXTENSION_SUFFIX) - -SOURCES := $(wildcard *.cpp) -OBJECTS = $(SOURCES:.cpp=.o) - -all:$(PYBIND_TARGET) - -manylinux: PYTHON ?= python3.11 -manylinux: CXXFLAGS = -std=c++17 -g -O3 -Wall -manylinux: LIBS = -luv -libverbs -lfmt -lboost_stacktrace_basic -ldl -manylinux: INCLUDES += -I/usr/local/include -manylinux: PYBIND11_INCLUDES = $(shell $(PYTHON) -m pybind11 --includes) -manylinux: PYTHON_EXTENSION_SUFFIX = $(shell $(PYTHON)-config --extension-suffix) -manylinux: PYBIND_TARGET = _infinistore$(PYTHON_EXTENSION_SUFFIX) -manylinux: $(PYBIND_TARGET) - -%.o: %.cpp - $(CXX) $(CXXFLAGS) $(INCLUDES) -MMD -MP -fPIC -c $< -o $@ - -%_generated.h: %.fbs - flatc --cpp $< - --include $(OBJECTS:.o=.d) - -$(PYBIND_TARGET): pybind.cpp libinfinistore.o utils.o protocol.o infinistore.o log.o ibv_helper.o mempool.o rdma.o - $(CXX) $(CXXFLAGS) $(INCLUDES) --shared -fPIC $(PYBIND11_INCLUDES) $^ \ - -o $(PYBIND_TARGET) $(LDFLAGS) $(LIBS) - rm -rf ../infinistore/$(PYBIND_TARGET) - cp $(PYBIND_TARGET) ../infinistore/ - - -.PHONY: clean test -clean: - rm -rf *.so *.o *.d test_client From de5b048537e14f9b8369602b963941fb86de868c Mon Sep 17 00:00:00 2001 From: thesues Date: Sat, 10 May 2025 09:43:57 +0000 Subject: [PATCH 10/12] wheel build --- build_manylinux_wheels.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build_manylinux_wheels.sh b/build_manylinux_wheels.sh index 49200dd..74c25c6 100644 --- a/build_manylinux_wheels.sh +++ b/build_manylinux_wheels.sh @@ -4,10 +4,12 @@ PYTHON_VERSIONS=( "/opt/python/cp312-cp312/bin/python3.12" ) +#clean up inplace build +rm infinistore/*.so + rm -rf dist/ wheelhouse/ OLDPATH=$PATH for PYTHON in "${PYTHON_VERSIONS[@]}"; do - rm -rf build/ BINDIR="$(dirname $PYTHON)" export PATH="$BINDIR:$OLDPATH" ${PYTHON} setup.py bdist_wheel From b970a730bfaa8880d287be11dc6c0eb46aa3eb41 Mon Sep 17 00:00:00 2001 From: thesues Date: Sat, 10 May 2025 10:18:50 +0000 Subject: [PATCH 11/12] all --- build_manylinux_wheels.sh | 1 + setup.py | 2 - src/allocate_response_generated.h | 110 ------------------- src/delete_keys_generated.h | 84 --------------- src/get_match_last_index_generated.h | 84 --------------- src/local_meta_generated.h | 153 --------------------------- src/meson.build | 71 +++++++++++++ src/meta_request.fbs | 1 + src/meta_request_generated.h | 116 -------------------- src/tcp_payload_request_generated.h | 94 ---------------- 10 files changed, 73 insertions(+), 643 deletions(-) delete mode 100644 src/allocate_response_generated.h delete mode 100644 src/delete_keys_generated.h delete mode 100644 src/get_match_last_index_generated.h delete mode 100644 src/local_meta_generated.h create mode 100644 src/meson.build delete mode 100644 src/meta_request_generated.h delete mode 100644 src/tcp_payload_request_generated.h diff --git a/build_manylinux_wheels.sh b/build_manylinux_wheels.sh index 74c25c6..4f7a198 100644 --- a/build_manylinux_wheels.sh +++ b/build_manylinux_wheels.sh @@ -12,6 +12,7 @@ OLDPATH=$PATH for PYTHON in "${PYTHON_VERSIONS[@]}"; do BINDIR="$(dirname $PYTHON)" export PATH="$BINDIR:$OLDPATH" + pip install meson ninja ${PYTHON} setup.py bdist_wheel #runtime will install ibverbs, so exclude it WHEEL_FILE=$(ls dist/*.whl) diff --git a/setup.py b/setup.py index f2b0617..d09e2dd 100644 --- a/setup.py +++ b/setup.py @@ -67,8 +67,6 @@ def run(self): "pybind11", "uvicorn", "numpy", - "meson", - "ninja", ], description="A kvcache memory pool", long_description=open("README.md").read(), diff --git a/src/allocate_response_generated.h b/src/allocate_response_generated.h deleted file mode 100644 index 472d3c3..0000000 --- a/src/allocate_response_generated.h +++ /dev/null @@ -1,110 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_ALLOCATERESPONSE_H_ -#define FLATBUFFERS_GENERATED_ALLOCATERESPONSE_H_ - -#include "flatbuffers/flatbuffers.h" - -struct RemoteBlock; - -struct RdmaAllocateResponse; -struct RdmaAllocateResponseBuilder; - -FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) RemoteBlock FLATBUFFERS_FINAL_CLASS { - private: - uint32_t rkey_; - int32_t padding0__; - uint64_t remote_addr_; - - public: - RemoteBlock() : rkey_(0), padding0__(0), remote_addr_(0) { (void)padding0__; } - RemoteBlock(uint32_t _rkey, uint64_t _remote_addr) - : rkey_(flatbuffers::EndianScalar(_rkey)), - padding0__(0), - remote_addr_(flatbuffers::EndianScalar(_remote_addr)) {} - uint32_t rkey() const { return flatbuffers::EndianScalar(rkey_); } - uint64_t remote_addr() const { return flatbuffers::EndianScalar(remote_addr_); } -}; -FLATBUFFERS_STRUCT_END(RemoteBlock, 16); - -struct RdmaAllocateResponse FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef RdmaAllocateResponseBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_BLOCKS = 4, - VT_ERROR_CODE = 6 - }; - const flatbuffers::Vector *blocks() const { - return GetPointer *>(VT_BLOCKS); - } - uint32_t error_code() const { return GetField(VT_ERROR_CODE, 0); } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_BLOCKS) && - verifier.VerifyVector(blocks()) && VerifyField(verifier, VT_ERROR_CODE) && - verifier.EndTable(); - } -}; - -struct RdmaAllocateResponseBuilder { - typedef RdmaAllocateResponse Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_blocks(flatbuffers::Offset> blocks) { - fbb_.AddOffset(RdmaAllocateResponse::VT_BLOCKS, blocks); - } - void add_error_code(uint32_t error_code) { - fbb_.AddElement(RdmaAllocateResponse::VT_ERROR_CODE, error_code, 0); - } - explicit RdmaAllocateResponseBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateRdmaAllocateResponse( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> blocks = 0, - uint32_t error_code = 0) { - RdmaAllocateResponseBuilder builder_(_fbb); - builder_.add_error_code(error_code); - builder_.add_blocks(blocks); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateRdmaAllocateResponseDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *blocks = nullptr, - uint32_t error_code = 0) { - auto blocks__ = blocks ? _fbb.CreateVectorOfStructs(*blocks) : 0; - return CreateRdmaAllocateResponse(_fbb, blocks__, error_code); -} - -inline const RdmaAllocateResponse *GetRdmaAllocateResponse(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const RdmaAllocateResponse *GetSizePrefixedRdmaAllocateResponse(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyRdmaAllocateResponseBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedRdmaAllocateResponseBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishRdmaAllocateResponseBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedRdmaAllocateResponseBuffer( - flatbuffers::FlatBufferBuilder &fbb, flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_ALLOCATERESPONSE_H_ diff --git a/src/delete_keys_generated.h b/src/delete_keys_generated.h deleted file mode 100644 index 3184cc9..0000000 --- a/src/delete_keys_generated.h +++ /dev/null @@ -1,84 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_DELETEKEYS_H_ -#define FLATBUFFERS_GENERATED_DELETEKEYS_H_ - -#include "flatbuffers/flatbuffers.h" - -struct DeleteKeysRequest; -struct DeleteKeysRequestBuilder; - -struct DeleteKeysRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef DeleteKeysRequestBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEYS = 4 }; - const flatbuffers::Vector> *keys() const { - return GetPointer> *>( - VT_KEYS); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEYS) && - verifier.VerifyVector(keys()) && verifier.VerifyVectorOfStrings(keys()) && - verifier.EndTable(); - } -}; - -struct DeleteKeysRequestBuilder { - typedef DeleteKeysRequest Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_keys( - flatbuffers::Offset>> keys) { - fbb_.AddOffset(DeleteKeysRequest::VT_KEYS, keys); - } - explicit DeleteKeysRequestBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateDeleteKeysRequest( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> keys = 0) { - DeleteKeysRequestBuilder builder_(_fbb); - builder_.add_keys(keys); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateDeleteKeysRequestDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *keys = nullptr) { - auto keys__ = keys ? _fbb.CreateVector>(*keys) : 0; - return CreateDeleteKeysRequest(_fbb, keys__); -} - -inline const DeleteKeysRequest *GetDeleteKeysRequest(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const DeleteKeysRequest *GetSizePrefixedDeleteKeysRequest(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyDeleteKeysRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedDeleteKeysRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishDeleteKeysRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedDeleteKeysRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_DELETEKEYS_H_ diff --git a/src/get_match_last_index_generated.h b/src/get_match_last_index_generated.h deleted file mode 100644 index 1aeee7a..0000000 --- a/src/get_match_last_index_generated.h +++ /dev/null @@ -1,84 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_GETMATCHLASTINDEX_H_ -#define FLATBUFFERS_GENERATED_GETMATCHLASTINDEX_H_ - -#include "flatbuffers/flatbuffers.h" - -struct GetMatchLastIndexRequest; -struct GetMatchLastIndexRequestBuilder; - -struct GetMatchLastIndexRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef GetMatchLastIndexRequestBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEYS = 4 }; - const flatbuffers::Vector> *keys() const { - return GetPointer> *>( - VT_KEYS); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEYS) && - verifier.VerifyVector(keys()) && verifier.VerifyVectorOfStrings(keys()) && - verifier.EndTable(); - } -}; - -struct GetMatchLastIndexRequestBuilder { - typedef GetMatchLastIndexRequest Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_keys( - flatbuffers::Offset>> keys) { - fbb_.AddOffset(GetMatchLastIndexRequest::VT_KEYS, keys); - } - explicit GetMatchLastIndexRequestBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateGetMatchLastIndexRequest( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> keys = 0) { - GetMatchLastIndexRequestBuilder builder_(_fbb); - builder_.add_keys(keys); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateGetMatchLastIndexRequestDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *keys = nullptr) { - auto keys__ = keys ? _fbb.CreateVector>(*keys) : 0; - return CreateGetMatchLastIndexRequest(_fbb, keys__); -} - -inline const GetMatchLastIndexRequest *GetGetMatchLastIndexRequest(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const GetMatchLastIndexRequest *GetSizePrefixedGetMatchLastIndexRequest(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyGetMatchLastIndexRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedGetMatchLastIndexRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishGetMatchLastIndexRequestBuffer( - flatbuffers::FlatBufferBuilder &fbb, flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedGetMatchLastIndexRequestBuffer( - flatbuffers::FlatBufferBuilder &fbb, flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_GETMATCHLASTINDEX_H_ diff --git a/src/local_meta_generated.h b/src/local_meta_generated.h deleted file mode 100644 index 8840cd9..0000000 --- a/src/local_meta_generated.h +++ /dev/null @@ -1,153 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_LOCALMETA_H_ -#define FLATBUFFERS_GENERATED_LOCALMETA_H_ - -#include "flatbuffers/flatbuffers.h" - -struct Block; -struct BlockBuilder; - -struct LocalMeta; -struct LocalMetaBuilder; - -struct Block FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef BlockBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEY = 4, VT_OFFSET = 6 }; - const flatbuffers::String *key() const { - return GetPointer(VT_KEY); - } - uint64_t offset() const { return GetField(VT_OFFSET, 0); } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEY) && - verifier.VerifyString(key()) && VerifyField(verifier, VT_OFFSET) && - verifier.EndTable(); - } -}; - -struct BlockBuilder { - typedef Block Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_key(flatbuffers::Offset key) { - fbb_.AddOffset(Block::VT_KEY, key); - } - void add_offset(uint64_t offset) { fbb_.AddElement(Block::VT_OFFSET, offset, 0); } - explicit BlockBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateBlock(flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset key = 0, - uint64_t offset = 0) { - BlockBuilder builder_(_fbb); - builder_.add_offset(offset); - builder_.add_key(key); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateBlockDirect(flatbuffers::FlatBufferBuilder &_fbb, - const char *key = nullptr, - uint64_t offset = 0) { - auto key__ = key ? _fbb.CreateString(key) : 0; - return CreateBlock(_fbb, key__, offset); -} - -struct LocalMeta FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef LocalMetaBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_IPC_HANDLE = 4, - VT_BLOCK_SIZE = 6, - VT_BLOCKS = 8 - }; - const flatbuffers::Vector *ipc_handle() const { - return GetPointer *>(VT_IPC_HANDLE); - } - int32_t block_size() const { return GetField(VT_BLOCK_SIZE, 0); } - const flatbuffers::Vector> *blocks() const { - return GetPointer> *>(VT_BLOCKS); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_IPC_HANDLE) && - verifier.VerifyVector(ipc_handle()) && - VerifyField(verifier, VT_BLOCK_SIZE) && VerifyOffset(verifier, VT_BLOCKS) && - verifier.VerifyVector(blocks()) && verifier.VerifyVectorOfTables(blocks()) && - verifier.EndTable(); - } -}; - -struct LocalMetaBuilder { - typedef LocalMeta Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_ipc_handle(flatbuffers::Offset> ipc_handle) { - fbb_.AddOffset(LocalMeta::VT_IPC_HANDLE, ipc_handle); - } - void add_block_size(int32_t block_size) { - fbb_.AddElement(LocalMeta::VT_BLOCK_SIZE, block_size, 0); - } - void add_blocks(flatbuffers::Offset>> blocks) { - fbb_.AddOffset(LocalMeta::VT_BLOCKS, blocks); - } - explicit LocalMetaBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateLocalMeta( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> ipc_handle = 0, int32_t block_size = 0, - flatbuffers::Offset>> blocks = 0) { - LocalMetaBuilder builder_(_fbb); - builder_.add_blocks(blocks); - builder_.add_block_size(block_size); - builder_.add_ipc_handle(ipc_handle); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateLocalMetaDirect( - flatbuffers::FlatBufferBuilder &_fbb, const std::vector *ipc_handle = nullptr, - int32_t block_size = 0, const std::vector> *blocks = nullptr) { - auto ipc_handle__ = ipc_handle ? _fbb.CreateVector(*ipc_handle) : 0; - auto blocks__ = blocks ? _fbb.CreateVector>(*blocks) : 0; - return CreateLocalMeta(_fbb, ipc_handle__, block_size, blocks__); -} - -inline const LocalMeta *GetLocalMeta(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const LocalMeta *GetSizePrefixedLocalMeta(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyLocalMetaBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedLocalMetaBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishLocalMetaBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedLocalMetaBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_LOCALMETA_H_ diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..eb2510d --- /dev/null +++ b/src/meson.build @@ -0,0 +1,71 @@ +project('infinistore', +['cpp'], +version: '0.0.0', +default_options: [ + 'buildtype=release', + 'cpp_std=c++17', + 'warning_level=2', # -Wall + 'optimization=3', # -O3 + 'debug=true' # -g +]) + +python3 = import('python').find_installation('python3', pure: false) + +# C/C++ compiler interface +cc = meson.get_compiler('cpp') + +fb_sources = [ + 'meta_request.fbs', + 'allocate_response.fbs', + 'delete_keys.fbs', + 'get_match_last_index.fbs', + 'tcp_payload_request.fbs' +] + +flatc = find_program('flatc', required: true) + +generated_headers = [] + +foreach fb_source : fb_sources + fb_name = fb_source.split('.')[0] + header_name = fb_name + '_generated.h' + # 输出到 build/generated 目录 + generated_headers += custom_target( + fb_name + '_gen', + input: fb_source, + output: header_name, + command: [ + flatc, + '--cpp', + '@INPUT@', + ], + ) +endforeach + +# External library dependencies +pybind_dep = dependency('pybind11', required: true) +libuv_dep = dependency('libuv', required: true) +fmt_dep = dependency('fmt', required: true) +boost_stack_dep = dependency('boost', modules: ['stacktrace_basic'], required: true) +ibverbs_dep = dependency('libibverbs', required: true) + +# Source files for the extension +ext_sources = [ + 'libinfinistore.cpp', + 'utils.cpp', + 'protocol.cpp', + 'infinistore.cpp', + 'log.cpp', + 'ibv_helper.cpp', + 'mempool.cpp', + 'rdma.cpp', + 'pybind.cpp' +] + + +# Build the Python extension module +python3.extension_module('_infinistore', + sources: ext_sources + generated_headers, + dependencies: [pybind_dep, libuv_dep, fmt_dep, boost_stack_dep, ibverbs_dep], + install: true +) \ No newline at end of file diff --git a/src/meta_request.fbs b/src/meta_request.fbs index b0b538b..dd8a756 100644 --- a/src/meta_request.fbs +++ b/src/meta_request.fbs @@ -6,4 +6,5 @@ table RemoteMetaRequest { remote_addrs: [ulong]; // GPU addresses op: byte; } + root_type RemoteMetaRequest; diff --git a/src/meta_request_generated.h b/src/meta_request_generated.h deleted file mode 100644 index db02580..0000000 --- a/src/meta_request_generated.h +++ /dev/null @@ -1,116 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_METAREQUEST_H_ -#define FLATBUFFERS_GENERATED_METAREQUEST_H_ - -#include "flatbuffers/flatbuffers.h" - -struct RemoteMetaRequest; -struct RemoteMetaRequestBuilder; - -struct RemoteMetaRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef RemoteMetaRequestBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_KEYS = 4, - VT_BLOCK_SIZE = 6, - VT_RKEY = 8, - VT_REMOTE_ADDRS = 10, - VT_OP = 12 - }; - const flatbuffers::Vector> *keys() const { - return GetPointer> *>( - VT_KEYS); - } - int32_t block_size() const { return GetField(VT_BLOCK_SIZE, 0); } - uint32_t rkey() const { return GetField(VT_RKEY, 0); } - const flatbuffers::Vector *remote_addrs() const { - return GetPointer *>(VT_REMOTE_ADDRS); - } - int8_t op() const { return GetField(VT_OP, 0); } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEYS) && - verifier.VerifyVector(keys()) && verifier.VerifyVectorOfStrings(keys()) && - VerifyField(verifier, VT_BLOCK_SIZE) && - VerifyField(verifier, VT_RKEY) && - VerifyOffset(verifier, VT_REMOTE_ADDRS) && verifier.VerifyVector(remote_addrs()) && - VerifyField(verifier, VT_OP) && verifier.EndTable(); - } -}; - -struct RemoteMetaRequestBuilder { - typedef RemoteMetaRequest Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_keys( - flatbuffers::Offset>> keys) { - fbb_.AddOffset(RemoteMetaRequest::VT_KEYS, keys); - } - void add_block_size(int32_t block_size) { - fbb_.AddElement(RemoteMetaRequest::VT_BLOCK_SIZE, block_size, 0); - } - void add_rkey(uint32_t rkey) { fbb_.AddElement(RemoteMetaRequest::VT_RKEY, rkey, 0); } - void add_remote_addrs(flatbuffers::Offset> remote_addrs) { - fbb_.AddOffset(RemoteMetaRequest::VT_REMOTE_ADDRS, remote_addrs); - } - void add_op(int8_t op) { fbb_.AddElement(RemoteMetaRequest::VT_OP, op, 0); } - explicit RemoteMetaRequestBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateRemoteMetaRequest( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> keys = 0, - int32_t block_size = 0, uint32_t rkey = 0, - flatbuffers::Offset> remote_addrs = 0, int8_t op = 0) { - RemoteMetaRequestBuilder builder_(_fbb); - builder_.add_remote_addrs(remote_addrs); - builder_.add_rkey(rkey); - builder_.add_block_size(block_size); - builder_.add_keys(keys); - builder_.add_op(op); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateRemoteMetaRequestDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *keys = nullptr, - int32_t block_size = 0, uint32_t rkey = 0, const std::vector *remote_addrs = nullptr, - int8_t op = 0) { - auto keys__ = keys ? _fbb.CreateVector>(*keys) : 0; - auto remote_addrs__ = remote_addrs ? _fbb.CreateVector(*remote_addrs) : 0; - return CreateRemoteMetaRequest(_fbb, keys__, block_size, rkey, remote_addrs__, op); -} - -inline const RemoteMetaRequest *GetRemoteMetaRequest(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const RemoteMetaRequest *GetSizePrefixedRemoteMetaRequest(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyRemoteMetaRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedRemoteMetaRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishRemoteMetaRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedRemoteMetaRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_METAREQUEST_H_ diff --git a/src/tcp_payload_request_generated.h b/src/tcp_payload_request_generated.h deleted file mode 100644 index afe3162..0000000 --- a/src/tcp_payload_request_generated.h +++ /dev/null @@ -1,94 +0,0 @@ -// automatically generated by the FlatBuffers compiler, do not modify - -#ifndef FLATBUFFERS_GENERATED_TCPPAYLOADREQUEST_H_ -#define FLATBUFFERS_GENERATED_TCPPAYLOADREQUEST_H_ - -#include "flatbuffers/flatbuffers.h" - -struct TCPPayloadRequest; -struct TCPPayloadRequestBuilder; - -struct TCPPayloadRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef TCPPayloadRequestBuilder Builder; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_KEY = 4, - VT_VALUE_LENGTH = 6, - VT_OP = 8 - }; - const flatbuffers::String *key() const { - return GetPointer(VT_KEY); - } - int32_t value_length() const { return GetField(VT_VALUE_LENGTH, 0); } - int8_t op() const { return GetField(VT_OP, 0); } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEY) && - verifier.VerifyString(key()) && VerifyField(verifier, VT_VALUE_LENGTH) && - VerifyField(verifier, VT_OP) && verifier.EndTable(); - } -}; - -struct TCPPayloadRequestBuilder { - typedef TCPPayloadRequest Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_key(flatbuffers::Offset key) { - fbb_.AddOffset(TCPPayloadRequest::VT_KEY, key); - } - void add_value_length(int32_t value_length) { - fbb_.AddElement(TCPPayloadRequest::VT_VALUE_LENGTH, value_length, 0); - } - void add_op(int8_t op) { fbb_.AddElement(TCPPayloadRequest::VT_OP, op, 0); } - explicit TCPPayloadRequestBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateTCPPayloadRequest( - flatbuffers::FlatBufferBuilder &_fbb, flatbuffers::Offset key = 0, - int32_t value_length = 0, int8_t op = 0) { - TCPPayloadRequestBuilder builder_(_fbb); - builder_.add_value_length(value_length); - builder_.add_key(key); - builder_.add_op(op); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateTCPPayloadRequestDirect( - flatbuffers::FlatBufferBuilder &_fbb, const char *key = nullptr, int32_t value_length = 0, - int8_t op = 0) { - auto key__ = key ? _fbb.CreateString(key) : 0; - return CreateTCPPayloadRequest(_fbb, key__, value_length, op); -} - -inline const TCPPayloadRequest *GetTCPPayloadRequest(const void *buf) { - return flatbuffers::GetRoot(buf); -} - -inline const TCPPayloadRequest *GetSizePrefixedTCPPayloadRequest(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); -} - -inline bool VerifyTCPPayloadRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); -} - -inline bool VerifySizePrefixedTCPPayloadRequestBuffer(flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); -} - -inline void FinishTCPPayloadRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.Finish(root); -} - -inline void FinishSizePrefixedTCPPayloadRequestBuffer(flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { - fbb.FinishSizePrefixed(root); -} - -#endif // FLATBUFFERS_GENERATED_TCPPAYLOADREQUEST_H_ From 7050c11a4d7972728e97ac47fc830d2e256b04ac Mon Sep 17 00:00:00 2001 From: thesues Date: Sat, 10 May 2025 19:54:16 +0000 Subject: [PATCH 12/12] add pyproject.toml --- Dockerfile.build | 9 --------- README.md | 4 ++-- build_manylinux_wheels.sh | 4 ++-- pyproject.toml | 9 +++++++++ setup.py | 1 - 5 files changed, 13 insertions(+), 14 deletions(-) create mode 100644 pyproject.toml diff --git a/Dockerfile.build b/Dockerfile.build index f967f69..990dcee 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -44,15 +44,6 @@ RUN rm -rf /tmp/flatbuffers # Install boost RUN dnf install -y boost boost-devel -# Install pybind11 for different versions of built-in python3 by almalinux -RUN /opt/python/cp310-cp310/bin/pip3 install pybind11 -RUN /opt/python/cp311-cp311/bin/pip3 install pybind11 - -# In almalinux, setuptools for python3.12 is not installed -# so install it -RUN /opt/python/cp312-cp312/bin/pip3 install setuptools -RUN /opt/python/cp312-cp312/bin/pip3 install pybind11 - # The above get the build environment ready! WORKDIR /app RUN git config --global --add safe.directory /app diff --git a/README.md b/README.md index 8a2a358..adb4fe4 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ Most users just need to deploy and run InfiniStore, and they don't need to under pip install infinistore ``` -## Install from Source Code +## Install from Source Code for develop For users who need to understand how InfiniStore code works or make code contributions to InfiniStore, it's recommended to install from source code: @@ -43,7 +43,7 @@ apt install libflatbuffers-dev apt install libspdlog-dev libfmt-dev apt install ibverbs-utils libibverbs-dev apt install libboost-dev libboost-stacktrace-dev -pip install -e . +pip install --no-build-isolation -e . pip install pre-commit pre-commit install ``` diff --git a/build_manylinux_wheels.sh b/build_manylinux_wheels.sh index 4f7a198..8cca1a1 100644 --- a/build_manylinux_wheels.sh +++ b/build_manylinux_wheels.sh @@ -12,8 +12,8 @@ OLDPATH=$PATH for PYTHON in "${PYTHON_VERSIONS[@]}"; do BINDIR="$(dirname $PYTHON)" export PATH="$BINDIR:$OLDPATH" - pip install meson ninja - ${PYTHON} setup.py bdist_wheel + + pip wheel -v . --no-deps -w dist/ #runtime will install ibverbs, so exclude it WHEEL_FILE=$(ls dist/*.whl) echo "WHEEL_FILE: ${WHEEL_FILE}" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8c44630 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel", + "meson", + "ninja", + "pybind11", +] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py index d09e2dd..07386e0 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,6 @@ def run(self): install_requires=[ "uvloop", "fastapi", - "pybind11", "uvicorn", "numpy", ],