From 84db9b46dcbba5a85d65377c7c1638440947f9e5 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Thu, 5 Feb 2026 18:11:23 +0000 Subject: [PATCH 1/7] Fetch proper number of free ports for Graph Store mode --- gigl/distributed/distributed_neighborloader.py | 2 +- .../distributed/graph_store/graph_store_integration_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gigl/distributed/distributed_neighborloader.py b/gigl/distributed/distributed_neighborloader.py index ded32b724..8ab8697ef 100644 --- a/gigl/distributed/distributed_neighborloader.py +++ b/gigl/distributed/distributed_neighborloader.py @@ -387,7 +387,7 @@ def _setup_for_graph_store( # Get sampling ports for compute-storage connections. sampling_ports = dataset.get_free_ports_on_storage_cluster( - num_ports=dataset.cluster_info.num_processes_per_compute + num_ports=dataset.cluster_info.num_compute_nodes ) sampling_port = sampling_ports[node_rank] diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 865006184..8d19d7277 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -441,7 +441,7 @@ def _get_expected_input_nodes_by_rank( server_nodes = get_ids_on_rank(partition_book, server_rank) for compute_rank in range(cluster_info.num_compute_nodes): generated_nodes = shard_nodes_by_process( - server_nodes, compute_rank, cluster_info.num_processes_per_compute + server_nodes, compute_rank, cluster_info.num_compute_nodes ) expected_sampler_input[compute_rank].append(generated_nodes) return dict(expected_sampler_input) @@ -466,7 +466,7 @@ def test_graph_store_homogeneous(self): host_ip = socket.gethostbyname(socket.gethostname()) cluster_info = GraphStoreInfo( num_storage_nodes=2, - num_compute_nodes=2, + num_compute_nodes=4, num_processes_per_compute=2, 
cluster_master_ip=host_ip, storage_cluster_master_ip=host_ip, From c85cbdb94bf923a75892e573bb4156bdd1df9354 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Thu, 5 Feb 2026 19:50:51 +0000 Subject: [PATCH 2/7] Fix --- .../distributed/graph_store/graph_store_integration_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index d6710d313..93187ff91 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -499,10 +499,12 @@ def _get_expected_input_nodes_by_rank( ) expected_sampler_input = collections.defaultdict(list) for server_rank in range(cluster_info.num_storage_nodes): - server_nodes = get_ids_on_rank(partition_book, server_rank) + server_nodes = get_ids_on_rank(partition_book=partition_book, rank=server_rank) for compute_rank in range(cluster_info.num_compute_nodes): generated_nodes = shard_nodes_by_process( - server_nodes, compute_rank, cluster_info.num_compute_nodes + input_nodes=server_nodes, + local_process_rank=compute_rank, + local_process_world_size=cluster_info.num_compute_nodes, ) expected_sampler_input[compute_rank].append(generated_nodes) return dict(expected_sampler_input) From 3a6adf71b439ed83312fbdd867df0b52cb60e488 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Fri, 6 Feb 2026 05:51:05 +0000 Subject: [PATCH 3/7] try fix --- .../distributed/graph_store/graph_store_integration_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 93187ff91..1dff26911 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ 
b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -626,7 +626,7 @@ def test_homogeneous_training(self): host_ip = socket.gethostbyname(socket.gethostname()) cluster_info = GraphStoreInfo( num_storage_nodes=2, - num_compute_nodes=2, + num_compute_nodes=4, num_processes_per_compute=2, cluster_master_ip=host_ip, storage_cluster_master_ip=host_ip, From e5a2225a9a04807961c8a08329043a44e10fc775 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Fri, 6 Feb 2026 16:54:27 +0000 Subject: [PATCH 4/7] maybe? --- tests/test_assets/test_case.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_assets/test_case.py b/tests/test_assets/test_case.py index d7ed45b38..eddc762ec 100644 --- a/tests/test_assets/test_case.py +++ b/tests/test_assets/test_case.py @@ -12,7 +12,7 @@ logger = Logger() -DEFAULT_TIMEOUT_SECONDS: Final[float] = 300.0 +DEFAULT_TIMEOUT_SECONDS: Final[float] = 60.0 * 10 # 10 minutes DEFAULT_POLL_INTERVAL_SECONDS: Final[float] = 0.1 From 7498cf98182846d7d829c1599f2c04fec34dd298 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Fri, 6 Feb 2026 17:39:24 +0000 Subject: [PATCH 5/7] smaller world size --- .../distributed/graph_store/graph_store_integration_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 1dff26911..1c7ecf17f 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -529,7 +529,7 @@ def test_graph_store_homogeneous(self): host_ip = socket.gethostbyname(socket.gethostname()) cluster_info = GraphStoreInfo( num_storage_nodes=2, - num_compute_nodes=4, + num_compute_nodes=2, num_processes_per_compute=2, cluster_master_ip=host_ip, storage_cluster_master_ip=host_ip, @@ -626,8 +626,8 @@ def 
test_homogeneous_training(self): host_ip = socket.gethostbyname(socket.gethostname()) cluster_info = GraphStoreInfo( num_storage_nodes=2, - num_compute_nodes=4, - num_processes_per_compute=2, + num_compute_nodes=2, + num_processes_per_compute=1, cluster_master_ip=host_ip, storage_cluster_master_ip=host_ip, compute_cluster_master_ip=host_ip, From ef29a6118a76744bab2f4813340a1e04628be642 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Fri, 6 Feb 2026 17:42:13 +0000 Subject: [PATCH 6/7] Add note --- .../graph_store/graph_store_integration_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 1c7ecf17f..14625b4f1 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -511,6 +511,15 @@ def _get_expected_input_nodes_by_rank( class GraphStoreIntegrationTest(TestCase): + """ + NOTE: Since these tests run on cloud build, + and our python process memory footprint is quite large due to tf, torch, etc., + we need to be careful to not spawn too many processes. + Otherwise we will OOM and see "mysterious" failures like the below: + make: *** [Makefile:119: integration_test] Error 137 + ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 + ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 + """ def test_graph_store_homogeneous(self): # Simulating two server machine, two compute machines. # Each machine has one process. 
From 09f891bd38145cc7bb6daa395eeb68e8ea948bcb Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Fri, 6 Feb 2026 18:00:42 +0000 Subject: [PATCH 7/7] format --- .../distributed/graph_store/graph_store_integration_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 14625b4f1..955d6239d 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -520,6 +520,7 @@ class GraphStoreIntegrationTest(TestCase): ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 """ + def test_graph_store_homogeneous(self): # Simulating two server machine, two compute machines. # Each machine has one process.