# File: src/infragraph/blueprints/fabrics/hybrid_debruijn_fabric.py
from infragraph import *
from infragraph.infragraph_service import InfraGraphService
import itertools


def _add_one2one_edge(infra, link_name, ep1_instance, ep1_component, ep2_instance, ep2_component):
    """Append one ONE2ONE edge between two fully indexed endpoints to *infra*."""
    edge = infra.edges.add(scheme=InfrastructureEdge.ONE2ONE, link=link_name)
    edge.ep1.instance = ep1_instance
    edge.ep1.component = ep1_component
    edge.ep2.instance = ep2_instance
    edge.ep2.component = ep2_component


class HybridDeBruijnFabric(Infrastructure):
    """
    A hybrid of a De Bruijn fabric and a Clos-style access layer of rack switches.

    Inputs:
        switch : device used for both fabric switches and access switches
        server : host device
        order  : order of the De Bruijn graph (label length), must be >= 1

    Derived:
        switch_port.count = switch radix (must be divisible by 8)
        degree            = switch_port.count // 8
        number of fabric switches = degree ** order

    Fabric Switch Ports:
        0..degree-1                   primary outgoing   (indexed by appended digit)
        degree..2degree-1             redundant outgoing (indexed by appended digit)
        2degree..3degree-1            primary incoming   (indexed by the digit shifted
                                                          out of the source label)
        3degree..4degree-1            redundant incoming (same indexing as primary)
        4degree                       access switch uplink
        4degree+1..switch_port.count-1 unused

    Access Switch Ports:
        0..(switch_port.count/2 - 1)  hosts
        switch_port.count/2           fabric switch connection
        remaining                     unused

    Raises:
        ValueError: if ``order`` < 1, the switch radix is not a positive
            multiple of 8, the host has no NICs, or the host NIC count does
            not divide the number of host-facing ports.
    """

    def __init__(self, switch: Device, server: Device, order: int):
        super().__init__(
            name="hybrid-debruijn-fabric",
            description=f"DeBruijn Fabric With Rack Switches(order={order})",
        )

        # order 0 would yield a single empty label whose successors do not exist
        if order < 1:
            raise ValueError("DeBruijn order must be at least 1")

        switch_port = InfraGraphService.get_component(switch, Component.PORT)
        host_nic = InfraGraphService.get_component(server, Component.NIC)

        # The switch radix must divide evenly across the full port plan:
        # half the ports are for fabric links and half are for host/access links;
        # within the fabric half, ports are split into incoming and outgoing;
        # within both incoming and outgoing groups, ports are split again into
        # primary and redundant links.
        # Therefore the switch port count must be divisible by 8 (2*2*2).
        if switch_port.count % 8 != 0:
            raise ValueError("Switch radix must be divisible by 8")

        # degree of graph = number of outgoing neighbours per fabric switch
        degree = switch_port.count // 8
        host_ports = switch_port.count // 2

        if degree < 1:
            raise ValueError("Not enough switch ports")

        if host_nic.count < 1:
            raise ValueError("Host must have at least one NIC")

        if host_ports % host_nic.count != 0:
            raise ValueError("Host NIC count must divide available host ports")

        # Each access switch dedicates half of its ports to hosts;
        # the host count follows from the number of NICs per host.
        hosts_per_access_switch = host_ports // host_nic.count

        self.devices.append(switch)
        self.devices.append(server)

        # Build De Bruijn node labels as tuples of integer digits.
        # For degree d and order n the fabric has d^n switches. Tuples (rather
        # than concatenated decimal strings) keep labels unambiguous when
        # degree > 10, where e.g. ("1", "10") and ("11", "0") would both join
        # to "110". Enumeration order matches the string-based product order.
        nodes = list(itertools.product(range(degree), repeat=order))
        num_switches = len(nodes)
        node_index = {node: i for i, node in enumerate(nodes)}

        # Create one fabric switch and one access switch per De Bruijn node
        fabric_switches = self.instances.add(name="fabric_switch", device=switch.name, count=num_switches)
        access_switches = self.instances.add(name="access_switch", device=switch.name, count=num_switches)

        # Create hosts per access/rack switch
        total_hosts = num_switches * hosts_per_access_switch
        hosts = self.instances.add(name="host", device=server.name, count=total_hosts)

        # Create links
        # fabric link connects fabric switches
        # access link connects a fabric switch and its access switch
        fabric_link = self.links.add(name="fabric-link", description="DeBruijn fabric connectivity")
        fabric_link.physical.bandwidth.gigabits_per_second = 400
        access_link = self.links.add(name="access-uplink", description="Access switch to fabric switch")
        access_link.physical.bandwidth.gigabits_per_second = 200
        host_link = self.links.add(name="host-link", description="Host to access switch")
        host_link.physical.bandwidth.gigabits_per_second = 100

        # De Bruijn fabric edges: shift the label left and append each digit.
        # Each successor gets a primary and a redundant link.
        for node in nodes:
            src_idx = node_index[node]
            # All predecessors of a given destination append the SAME digit, so
            # the incoming port cannot be keyed on the appended digit. The digit
            # shifted out (node[0]) is unique per predecessor and gives each
            # incoming link its own port.
            in_slot = node[0]
            src_instance = f"{fabric_switches.name}[{src_idx}]"
            for digit in range(degree):
                dst_idx = node_index[node[1:] + (digit,)]
                dst_instance = f"{fabric_switches.name}[{dst_idx}]"

                # primary link
                _add_one2one_edge(
                    self,
                    fabric_link.name,
                    src_instance,
                    f"{switch_port.name}[{digit}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 2 * degree}]",
                )

                # redundant link
                _add_one2one_edge(
                    self,
                    fabric_link.name,
                    src_instance,
                    f"{switch_port.name}[{digit + degree}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 3 * degree}]",
                )

        # Access switch to fabric switch edges.
        # Host NICs occupy access-switch ports 0..host_ports-1, so the uplink
        # uses the first port after that range instead of port 0.
        fabric_uplink_port = 4 * degree
        access_uplink_port = host_ports
        for idx in range(num_switches):
            _add_one2one_edge(
                self,
                access_link.name,
                f"{access_switches.name}[{idx}]",
                f"{switch_port.name}[{access_uplink_port}]",
                f"{fabric_switches.name}[{idx}]",
                f"{switch_port.name}[{fabric_uplink_port}]",
            )

        # Attach hosts to access switches: host h on a switch uses the
        # contiguous port range [h * nic_count, (h + 1) * nic_count).
        for sw_idx in range(num_switches):
            access_instance = f"{access_switches.name}[{sw_idx}]"
            for h in range(hosts_per_access_switch):
                host_index = sw_idx * hosts_per_access_switch + h
                for nic in range(host_nic.count):
                    port_index = h * host_nic.count + nic
                    _add_one2one_edge(
                        self,
                        host_link.name,
                        f"{hosts.name}[{host_index}]",
                        f"{host_nic.name}[{nic}]",
                        access_instance,
                        f"{switch_port.name}[{port_index}]",
                    )
# File: src/infragraph/blueprints/fabrics/multi_host_debruijn_fabric.py
from infragraph import *
from infragraph.infragraph_service import InfraGraphService
import itertools


def _add_one2one_edge(infra, link_name, ep1_instance, ep1_component, ep2_instance, ep2_component):
    """Append one ONE2ONE edge between two fully indexed endpoints to *infra*."""
    edge = infra.edges.add(scheme=InfrastructureEdge.ONE2ONE, link=link_name)
    edge.ep1.instance = ep1_instance
    edge.ep1.component = ep1_component
    edge.ep2.instance = ep2_instance
    edge.ep2.component = ep2_component


class MultiHostDeBruijnFabric(Infrastructure):
    """
    DeBruijn Fabric with Multiple Hosts per Switch + Redundant Fabric Links

    Inputs
        switch : fabric switch
        server : host device
        order  : DeBruijn order (label length), must be >= 1

    Derived
        d = switch_port.count // 8
        number of switches = d ** order

    Ports
        0..d-1                        primary outgoing   (indexed by appended digit)
        d..2d-1                       redundant outgoing (indexed by appended digit)
        2d..3d-1                      primary incoming   (indexed by the digit shifted
                                                          out of the source label)
        3d..4d-1                      redundant incoming (same indexing as primary)
        4d..4d+switch_port.count//2-1 host ports

    Raises
        ValueError: if ``order`` < 1, the switch has fewer than 8 ports, the
            host has no NICs, or the host NIC count does not divide the
            number of host-facing ports.
    """

    def __init__(self, switch: Device, server: Device, order: int):
        super().__init__(
            name="multi-host-redundant-debruijn",
            description=f"DeBruijn Fabric (k={order})",
        )

        # order 0 would yield a single empty label whose successors do not exist
        if order < 1:
            raise ValueError("DeBruijn order must be at least 1")

        switch_port = InfraGraphService.get_component(switch, Component.PORT)
        host_nic = InfraGraphService.get_component(server, Component.NIC)

        degree = switch_port.count // 8
        host_ports = switch_port.count // 2

        if degree < 1:
            raise ValueError("Not enough switch ports")

        if host_nic.count < 1:
            raise ValueError("Host must have at least one NIC")

        if host_ports % host_nic.count != 0:
            raise ValueError(
                f"Host NICs ({host_nic.count}) must divide available host ports ({host_ports})"
            )

        # Each fabric switch dedicates half of its ports to hosts;
        # the host count follows from the number of NICs per host.
        hosts_per_switch = host_ports // host_nic.count

        self.devices.append(switch)
        self.devices.append(server)

        # Build De Bruijn node labels as tuples of integer digits.
        # For degree d and order n the fabric has d^n switches. Tuples (rather
        # than concatenated decimal strings) keep labels unambiguous when
        # degree > 10. Enumeration order matches the string-based product order.
        nodes = list(itertools.product(range(degree), repeat=order))
        num_switches = len(nodes)
        node_index = {node: i for i, node in enumerate(nodes)}

        # Create fabric switches and hosts
        switches = self.instances.add(name="switch", device=switch.name, count=num_switches)
        hosts = self.instances.add(name="host", device=server.name, count=num_switches * hosts_per_switch)

        # Links
        # fabric link connects fabric switches
        # host link connects hosts with fabric switches
        fabric_link = self.links.add(name="fabric-link", description="DeBruijn connectivity")
        fabric_link.physical.bandwidth.gigabits_per_second = 400
        host_link = self.links.add(name="host-link", description="Host to switch connectivity")
        host_link.physical.bandwidth.gigabits_per_second = 100

        # De Bruijn fabric edges: shift the label left and append each digit.
        # Each successor gets a primary and a redundant link.
        for node in nodes:
            src_idx = node_index[node]
            # All predecessors of a given destination append the SAME digit, so
            # the incoming port is keyed on the digit shifted out (node[0]),
            # which is unique per predecessor.
            in_slot = node[0]
            src_instance = f"{switches.name}[{src_idx}]"
            for digit in range(degree):
                dst_idx = node_index[node[1:] + (digit,)]
                dst_instance = f"{switches.name}[{dst_idx}]"

                # Primary link
                _add_one2one_edge(
                    self,
                    fabric_link.name,
                    src_instance,
                    f"{switch_port.name}[{digit}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 2 * degree}]",
                )

                # Redundant link
                _add_one2one_edge(
                    self,
                    fabric_link.name,
                    src_instance,
                    f"{switch_port.name}[{digit + degree}]",
                    dst_instance,
                    f"{switch_port.name}[{in_slot + 3 * degree}]",
                )

        # Attach hosts directly to the fabric switches, starting at the first
        # port after the four fabric port groups.
        host_port_start = 4 * degree
        for sw_idx in range(num_switches):
            switch_instance = f"{switches.name}[{sw_idx}]"
            for h in range(hosts_per_switch):
                host_global_idx = sw_idx * hosts_per_switch + h
                for nic in range(host_nic.count):
                    port_offset = h * host_nic.count + nic
                    _add_one2one_edge(
                        self,
                        host_link.name,
                        f"{hosts.name}[{host_global_idx}]",
                        f"{host_nic.name}[{nic}]",
                        switch_instance,
                        f"{switch_port.name}[{host_port_start + port_offset}]",
                    )


# File: src/tests/test_blueprints/test_hybrid_debruijn_fabric.py
from infragraph.infragraph_service import InfraGraphService
from infragraph.blueprints.devices.generic.server import Server
from infragraph.blueprints.devices.generic.generic_switch import Switch
from infragraph.blueprints.fabrics.hybrid_debruijn_fabric import HybridDeBruijnFabric
from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX
import networkx
import pytest

# DGX device profiles exercised by the parametrized test below
DGX_PROFILES = [
    "dgx1",
    "dgx2",
    "dgx_a100",
    "dgx_h100",
    "dgx_gb200",
]


@pytest.mark.asyncio
async def test_hybrid_debruijn_fabric():
    """
    Generate a hybrid debruijn fabric

    """
    switch = Switch(port_count=16)
    server = Server()
    fabric = HybridDeBruijnFabric(switch, server, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    assert graph.number_of_nodes() > 0
    # write_network_text writes to stdout itself and returns None, so it is
    # called directly rather than wrapped in print().
    networkx.write_network_text(graph, vertical_chains=True)


@pytest.mark.asyncio
@pytest.mark.parametrize("dgx_profile", DGX_PROFILES)
async def test_hybrid_debruijn_fabric_with_dgx(dgx_profile):
    """
    Generate a hybrid debruijn fabric with each supported DGX device

    """
    switch = Switch(port_count=16)
    dgx = NvidiaDGX(dgx_profile)
    fabric = HybridDeBruijnFabric(switch, dgx, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    assert graph.number_of_nodes() > 0
    networkx.write_network_text(graph, vertical_chains=True)


if __name__ == "__main__":
    pytest.main(["-s", __file__])


# File: src/tests/test_blueprints/test_multi_host_debruijn_fabric.py
from infragraph.infragraph_service import InfraGraphService
from infragraph.blueprints.devices.generic.server import Server
from infragraph.blueprints.devices.generic.generic_switch import Switch
from infragraph.blueprints.fabrics.multi_host_debruijn_fabric import MultiHostDeBruijnFabric
from infragraph.blueprints.devices.nvidia.dgx import NvidiaDGX
import networkx
import pytest

# DGX device profiles exercised by the parametrized test below
DGX_PROFILES = [
    "dgx1",
    "dgx2",
    "dgx_a100",
    "dgx_h100",
    "dgx_gb200",
]


@pytest.mark.asyncio
async def test_debruijn_multi_host_fabric():
    """
    Generate a debruijn fabric with multiple hosts per switch

    """
    switch = Switch(port_count=16)
    server = Server()
    fabric = MultiHostDeBruijnFabric(switch, server, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    assert graph.number_of_nodes() > 0
    # write_network_text writes to stdout itself and returns None, so it is
    # called directly rather than wrapped in print().
    networkx.write_network_text(graph, vertical_chains=True)


@pytest.mark.asyncio
@pytest.mark.parametrize("dgx_profile", DGX_PROFILES)
async def test_debruijn_multi_host_fabric_with_dgx(dgx_profile):
    """
    Generate a debruijn fabric with each supported DGX device

    """
    switch = Switch(port_count=16)
    dgx = NvidiaDGX(dgx_profile)
    fabric = MultiHostDeBruijnFabric(switch, dgx, 3)

    service = InfraGraphService()
    service.set_graph(fabric)

    graph = service.get_networkx_graph()
    assert graph.number_of_nodes() > 0
    networkx.write_network_text(graph, vertical_chains=True)


if __name__ == "__main__":
    pytest.main(["-s", __file__])