Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 54 additions & 6 deletions mlx/distributed/jaccl/lib/jaccl/rdma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,61 @@ const Destination& Connection::info() {

ibv_port_attr port_attr;
ibv().query_port(ctx, 1, &port_attr);
ibv_gid gid;
for (int i = 0; i < port_attr.gid_tbl_len; i++) {
ibv_gid gid = {};

auto try_gid = [&](int i, bool require_ipv4_mapped) -> bool {
if (i < 0 || i >= port_attr.gid_tbl_len) {
return false;
}
ibv_gid tmp;
if (ibv().query_gid(ctx, 1, i, &tmp) == 0) {
if (*(uint64_t*)&tmp.raw[0] == 0 && *(uint16_t*)&tmp.raw[8] == 0 &&
*(uint16_t*)&tmp.raw[10] == 0xffff) {
gid = tmp;
if (ibv().query_gid(ctx, 1, i, &tmp) != 0) {
return false;
}
if (require_ipv4_mapped) {
if (*(uint64_t*)&tmp.raw[0] != 0 || *(uint16_t*)&tmp.raw[8] != 0 ||
*(uint16_t*)&tmp.raw[10] != 0xffff) {
return false;
}
} else {
bool is_zero = true;
for (int j = 0; j < 16; j++) {
if (tmp.raw[j] != 0) {
is_zero = false;
break;
}
}
if (is_zero) {
return false;
}
}
gid = tmp;
return true;
};

// 1. Prefer IPv4-mapped IPv6 GID (RoCE v2 standard).
bool found = false;
for (int i = 0; i < port_attr.gid_tbl_len; i++) {
if (try_gid(i, /*require_ipv4_mapped=*/true)) {
found = true;
break;
}
}

// 2. Fallback for Apple Thunderbolt RDMA, which exposes only link-local
// IPv6 (fe80::...) GIDs. Prefer index 1 (the actual rdma_enX port GID;
// index 0 on Apple TB is typically derived from a non-RDMA interface
// and routes elsewhere, surfacing as RTR errno 60 ETIMEDOUT).
if (!found) {
for (int i : {1, 0}) {
if (try_gid(i, /*require_ipv4_mapped=*/false)) {
found = true;
break;
}
}
}
if (!found) {
for (int i = 2; i < port_attr.gid_tbl_len; i++) {
if (try_gid(i, /*require_ipv4_mapped=*/false)) {
break;
}
}
Expand Down
Loading