Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions src/infragraph/blueprints/fabrics/hybrid_debruijn_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from infragraph import *
from infragraph.infragraph_service import InfraGraphService
import itertools

class HybridDeBruijnFabric(Infrastructure):
    """
    A hybrid of a De Bruijn fabric and a Clos fabric with an access layer of rack switches.

    Inputs:
        switch : switch device, used for both the fabric and the access switches
        server : host device
        order  : order of the De Bruijn graph (length of a node label), must be >= 1

    Derived:
        switch_port.count = switch radix (must be divisible by 8)
        degree            = switch_port.count / 8
        fabric switches   = degree ** order, each paired with one access switch

    Fabric Switch Ports:
        0..degree-1                      primary outgoing   (indexed by appended symbol)
        degree..2*degree-1               redundant outgoing (indexed by appended symbol)
        2*degree..3*degree-1             primary incoming   (indexed by source's leading symbol)
        3*degree..4*degree-1             redundant incoming (indexed by source's leading symbol)
        4*degree                         access switch uplink
        4*degree+1..switch_port.count-1  unused

    Access Switch Ports:
        0..(switch_port.count/2 - 1)     hosts
        switch_port.count/2              uplink to the paired fabric switch
        remaining                        unused

    Raises:
        ValueError: if order < 1, the switch radix is not a positive multiple
            of 8, the host has no NICs, or the host NIC count does not divide
            the number of host-facing ports.
    """

    def __init__(self, switch: Device, server: Device, order: int):
        super().__init__(
            name="hybrid-debruijn-fabric",
            description=f"DeBruijn Fabric With Rack Switches(order={order})",
        )

        # With order < 1 the node labels are empty and the shift-and-append
        # successor lookup below cannot resolve to an existing node.
        if order < 1:
            raise ValueError("DeBruijn order must be at least 1")

        switch_port = InfraGraphService.get_component(switch, Component.PORT)
        host_nic = InfraGraphService.get_component(server, Component.NIC)

        # The switch radix must divide evenly across the full port plan:
        # half the ports are for fabric links and half are for host/access links;
        # within the fabric half, ports are split into incoming and outgoing;
        # within both incoming and outgoing groups, ports are split again into
        # primary and redundant links. Therefore the radix must be divisible
        # by 8 (2*2*2).
        if switch_port.count % 8 != 0:
            raise ValueError("Switch radix must be divisible by 8")

        # degree of graph = number of successor (and predecessor) switches
        degree = switch_port.count // 8
        host_ports = switch_port.count // 2

        if degree < 1:
            raise ValueError("Not enough switch ports")

        # Guard the modulo/division below against a NIC-less host.
        if host_nic.count < 1:
            raise ValueError("Host must have at least one NIC")

        if host_ports % host_nic.count != 0:
            raise ValueError("Host NIC count must divide available host ports")

        # Each access switch dedicates half of its ports to hosts;
        # the host count follows from the number of NICs per host.
        hosts_per_access_switch = host_ports // host_nic.count

        self.devices.append(switch)
        self.devices.append(server)

        # Build De Bruijn node labels as tuples of integer symbols.
        # For degree d and order n the fabric has d^n switches. Tuples keep
        # every symbol distinct even when degree > 10, where concatenated
        # decimal strings would become ambiguous.
        symbols = range(degree)
        nodes = list(itertools.product(symbols, repeat=order))
        node_index = {node: i for i, node in enumerate(nodes)}
        num_switches = len(nodes)

        # Create one fabric switch and one access switch per De Bruijn node
        fabric_switches = self.instances.add(name="fabric_switch", device=switch.name, count=num_switches)
        access_switches = self.instances.add(name="access_switch", device=switch.name, count=num_switches)

        # Create the hosts of every access/rack switch
        total_hosts = num_switches * hosts_per_access_switch
        hosts = self.instances.add(name="host", device=server.name, count=total_hosts)

        # Create links
        # fabric link connects fabric switches
        # access link connects a fabric switch and its access switch
        # host link connects a host NIC and an access switch
        fabric_link = self.links.add(name="fabric-link", description="DeBruijn fabric connectivity")
        fabric_link.physical.bandwidth.gigabits_per_second = 400
        access_link = self.links.add(name="access-uplink", description="Access switch to fabric switch")
        access_link.physical.bandwidth.gigabits_per_second = 200
        host_link = self.links.add(name="host-link", description="Host to access switch")
        host_link.physical.bandwidth.gigabits_per_second = 100

        def connect(link, src_instance, src_component, dst_instance, dst_component):
            # Add a single one-to-one edge between two fully indexed endpoints.
            edge = self.edges.add(scheme=InfrastructureEdge.ONE2ONE, link=link.name)
            edge.ep1.instance = src_instance
            edge.ep1.component = src_component
            edge.ep2.instance = dst_instance
            edge.ep2.component = dst_component

        # De Bruijn fabric edges: the successor of a node is its label shifted
        # left by one symbol with a new symbol appended. Every successor gets
        # a primary and a redundant link.
        for src_idx, node in enumerate(nodes):
            # All predecessors of a node share its leading order-1 symbols and
            # differ only in their own first symbol, so that symbol selects a
            # distinct incoming port on the destination switch.
            in_slot = node[0]
            src_instance = f"{fabric_switches.name}[{src_idx}]"
            for out_slot in symbols:
                dst_idx = node_index[node[1:] + (out_slot,)]
                dst_instance = f"{fabric_switches.name}[{dst_idx}]"

                # primary link
                connect(
                    fabric_link,
                    src_instance,
                    f"{switch_port.name}[{out_slot}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 2 * degree}]",
                )

                # redundant link
                connect(
                    fabric_link,
                    src_instance,
                    f"{switch_port.name}[{out_slot + degree}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 3 * degree}]",
                )

        # Access switch to fabric switch edges. The access side uses the first
        # port after the host-facing range so it never collides with a host.
        uplink_start = 4 * degree
        for idx in range(num_switches):
            connect(
                access_link,
                f"{access_switches.name}[{idx}]",
                f"{switch_port.name}[{host_ports}]",
                f"{fabric_switches.name}[{idx}]",
                f"{switch_port.name}[{uplink_start}]",
            )

        # Attach hosts to their access switch, one access port per host NIC
        host_index = 0
        for sw_idx in range(num_switches):
            for h in range(hosts_per_access_switch):
                for nic in range(host_nic.count):
                    port_index = h * host_nic.count + nic
                    connect(
                        host_link,
                        f"{hosts.name}[{host_index}]",
                        f"{host_nic.name}[{nic}]",
                        f"{access_switches.name}[{sw_idx}]",
                        f"{switch_port.name}[{port_index}]",
                    )
                host_index += 1

104 changes: 104 additions & 0 deletions src/infragraph/blueprints/fabrics/multi_host_debruijn_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from infragraph import *
from infragraph.infragraph_service import InfraGraphService
import itertools


class MultiHostDeBruijnFabric(Infrastructure):
    """
    De Bruijn fabric with multiple hosts per switch and redundant fabric links.

    Inputs:
        switch : fabric switch device
        server : host device
        order  : order of the De Bruijn graph (length of a node label), must be >= 1

    Derived:
        d        = switch_port.count // 8 (degree of the graph)
        switches = d ** order

    Ports:
        0..d-1                            primary outgoing   (indexed by appended symbol)
        d..2d-1                           redundant outgoing (indexed by appended symbol)
        2d..3d-1                          primary incoming   (indexed by source's leading symbol)
        3d..4d-1                          redundant incoming (indexed by source's leading symbol)
        4d..4d+(switch_port.count//2)-1   host ports

    When the switch radix is not a multiple of 8 the integer divisions above
    leave the highest-numbered ports unused.

    Raises:
        ValueError: if order < 1, the switch has fewer than 8 ports, the host
            has no NICs, or the host NIC count does not divide the number of
            host-facing ports.
    """

    def __init__(self, switch: Device, server: Device, order: int):
        super().__init__(
            name="multi-host-redundant-debruijn",
            description=f"DeBruijn Fabric (k={order})",
        )

        # With order < 1 the node labels are empty and the shift-and-append
        # successor lookup below cannot resolve to an existing node.
        if order < 1:
            raise ValueError("DeBruijn order must be at least 1")

        switch_port = InfraGraphService.get_component(switch, Component.PORT)
        host_nic = InfraGraphService.get_component(server, Component.NIC)

        degree = switch_port.count // 8
        host_ports = switch_port.count // 2

        if degree < 1:
            raise ValueError("Not enough switch ports")

        # Guard the modulo/division below against a NIC-less host.
        if host_nic.count < 1:
            raise ValueError("Host must have at least one NIC")

        if host_ports % host_nic.count != 0:
            raise ValueError(
                f"Host NICs ({host_nic.count}) must divide available host ports ({host_ports})"
            )

        # Each switch dedicates half of its ports to hosts;
        # the host count follows from the number of NICs per host.
        hosts_per_switch = host_ports // host_nic.count

        self.devices.append(switch)
        self.devices.append(server)

        # Build De Bruijn node labels as tuples of integer symbols.
        # For degree d and order n the fabric has d^n switches. Tuples keep
        # every symbol distinct even when degree > 10, where concatenated
        # decimal strings would become ambiguous.
        symbols = range(degree)
        nodes = list(itertools.product(symbols, repeat=order))
        node_index = {node: i for i, node in enumerate(nodes)}
        num_switches = len(nodes)

        # Create fabric switches and hosts
        switches = self.instances.add(name="switch", device=switch.name, count=num_switches)
        hosts = self.instances.add(name="host", device=server.name, count=num_switches * hosts_per_switch)

        # Create links
        # fabric link connects fabric switches
        # host link connects hosts with fabric switches
        fabric_link = self.links.add(name="fabric-link", description="DeBruijn connectivity")
        fabric_link.physical.bandwidth.gigabits_per_second = 400
        host_link = self.links.add(name="host-link", description="Host to switch connectivity")
        host_link.physical.bandwidth.gigabits_per_second = 100

        def connect(link, src_instance, src_component, dst_instance, dst_component):
            # Add a single one-to-one edge between two fully indexed endpoints.
            edge = self.edges.add(scheme=InfrastructureEdge.ONE2ONE, link=link.name)
            edge.ep1.instance = src_instance
            edge.ep1.component = src_component
            edge.ep2.instance = dst_instance
            edge.ep2.component = dst_component

        # De Bruijn fabric edges: the successor of a node is its label shifted
        # left by one symbol with a new symbol appended. Every successor gets
        # a primary and a redundant link.
        for src_idx, node in enumerate(nodes):
            # All predecessors of a node share its leading order-1 symbols and
            # differ only in their own first symbol, so that symbol selects a
            # distinct incoming port on the destination switch.
            in_slot = node[0]
            src_instance = f"{switches.name}[{src_idx}]"
            for out_slot in symbols:
                dst_idx = node_index[node[1:] + (out_slot,)]
                dst_instance = f"{switches.name}[{dst_idx}]"

                # Primary link
                connect(
                    fabric_link,
                    src_instance,
                    f"{switch_port.name}[{out_slot}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 2 * degree}]",
                )

                # Redundant link
                connect(
                    fabric_link,
                    src_instance,
                    f"{switch_port.name}[{out_slot + degree}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 3 * degree}]",
                )

        # Attach hosts directly to the fabric switches, one switch port per
        # host NIC, starting right after the fabric-facing ports.
        host_port_start = 4 * degree
        host_global_idx = 0
        for sw_idx in range(num_switches):
            for h in range(hosts_per_switch):
                for nic in range(host_nic.count):
                    port_offset = h * host_nic.count + nic
                    connect(
                        host_link,
                        f"{hosts.name}[{host_global_idx}]",
                        f"{host_nic.name}[{nic}]",
                        f"{switches.name}[{sw_idx}]",
                        f"{switch_port.name}[{host_port_start + port_offset}]",
                    )
                host_global_idx += 1
51 changes: 51 additions & 0 deletions src/tests/test_blueprints/test_hybrid_debruijn_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from infragraph.infragraph_service import InfraGraphService
from infragraph.blueprints.devices.generic.server import Server
from infragraph.blueprints.devices.generic.generic_switch import Switch
from infragraph.blueprints.fabrics.hybrid_debruijn_fabric import HybridDeBruijnFabric
from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX
import networkx
import pytest

# NvidiaDGX profile names; each one is fed to the parametrized DGX test in
# this module and passed to the NvidiaDGX constructor.
DGX_PROFILES = [
    "dgx1",
    "dgx2",
    "dgx_a100",
    "dgx_h100",
    "dgx_gb200",
]
@pytest.mark.asyncio
async def test_hybrid_debruijn_fabric():
    """
    Generate a hybrid De Bruijn fabric from a generic 16-port switch and a
    generic server, then render the resulting graph as text.
    """
    switch = Switch(port_count=16)
    server = Server()
    fabric = HybridDeBruijnFabric(switch, server, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    # A built fabric must contribute at least some nodes and edges.
    assert graph.number_of_nodes() > 0
    assert graph.number_of_edges() > 0

    # write_network_text() writes to stdout itself and returns None, so
    # wrapping it in print() would only add a stray "None" line.
    networkx.write_network_text(graph, vertical_chains=True)

@pytest.mark.asyncio
@pytest.mark.parametrize("dgx_profile", DGX_PROFILES)
async def test_hybrid_debruijn_fabric_with_dgx(dgx_profile):
    """
    Generate a hybrid De Bruijn fabric with each supported DGX device as the
    host, then render the resulting graph as text.
    """
    switch = Switch(port_count=16)
    dgx = NvidiaDGX(dgx_profile)
    fabric = HybridDeBruijnFabric(switch, dgx, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    # A built fabric must contribute at least some nodes and edges.
    assert graph.number_of_nodes() > 0
    assert graph.number_of_edges() > 0

    # write_network_text() writes to stdout itself and returns None, so
    # wrapping it in print() would only add a stray "None" line.
    networkx.write_network_text(graph, vertical_chains=True)


# Allow running this test module directly; "-s" turns off pytest's output
# capturing so the rendered graph text is shown.
if __name__ == "__main__":
    pytest.main(["-s", __file__])
51 changes: 51 additions & 0 deletions src/tests/test_blueprints/test_multi_host_debruijn_fabric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from infragraph.infragraph_service import InfraGraphService
from infragraph.blueprints.devices.generic.server import Server
from infragraph.blueprints.devices.generic.generic_switch import Switch
from infragraph.blueprints.fabrics.multi_host_debruijn_fabric import MultiHostDeBruijnFabric
from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX
import networkx
import pytest

# NvidiaDGX profile names; each one is fed to the parametrized DGX test in
# this module and passed to the NvidiaDGX constructor.
DGX_PROFILES = [
    "dgx1",
    "dgx2",
    "dgx_a100",
    "dgx_h100",
    "dgx_gb200",
]
@pytest.mark.asyncio
async def test_debruijn_multi_host_fabric():
    """
    Generate a De Bruijn fabric with multiple hosts per switch from a generic
    16-port switch and a generic server, then render the graph as text.
    """
    switch = Switch(port_count=16)
    server = Server()
    fabric = MultiHostDeBruijnFabric(switch, server, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    # A built fabric must contribute at least some nodes and edges.
    assert graph.number_of_nodes() > 0
    assert graph.number_of_edges() > 0

    # write_network_text() writes to stdout itself and returns None, so
    # wrapping it in print() would only add a stray "None" line.
    networkx.write_network_text(graph, vertical_chains=True)

@pytest.mark.asyncio
@pytest.mark.parametrize("dgx_profile", DGX_PROFILES)
async def test_debruijn_multi_host_fabric_with_dgx(dgx_profile):
    """
    Generate a multi-host De Bruijn fabric with each supported DGX device as
    the host, then render the resulting graph as text.
    """
    switch = Switch(port_count=16)
    dgx = NvidiaDGX(dgx_profile)
    fabric = MultiHostDeBruijnFabric(switch, dgx, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    # A built fabric must contribute at least some nodes and edges.
    assert graph.number_of_nodes() > 0
    assert graph.number_of_edges() > 0

    # write_network_text() writes to stdout itself and returns None, so
    # wrapping it in print() would only add a stray "None" line.
    networkx.write_network_text(graph, vertical_chains=True)


# Allow running this test module directly; "-s" turns off pytest's output
# capturing so the rendered graph text is shown.
if __name__ == "__main__":
    pytest.main(["-s", __file__])
Loading