diff --git a/gigl/distributed/distributed_neighborloader.py b/gigl/distributed/distributed_neighborloader.py index cfdd59d6f..2be71c055 100644 --- a/gigl/distributed/distributed_neighborloader.py +++ b/gigl/distributed/distributed_neighborloader.py @@ -388,7 +388,7 @@ def _setup_for_graph_store( # Get sampling ports for compute-storage connections. sampling_ports = dataset.get_free_ports_on_storage_cluster( - num_ports=dataset.cluster_info.num_processes_per_compute + num_ports=dataset.cluster_info.num_compute_nodes ) sampling_port = sampling_ports[node_rank] diff --git a/tests/integration/distributed/graph_store/graph_store_integration_test.py b/tests/integration/distributed/graph_store/graph_store_integration_test.py index 3b5074d34..955d6239d 100644 --- a/tests/integration/distributed/graph_store/graph_store_integration_test.py +++ b/tests/integration/distributed/graph_store/graph_store_integration_test.py @@ -499,16 +499,28 @@ def _get_expected_input_nodes_by_rank( ) expected_sampler_input = collections.defaultdict(list) for server_rank in range(cluster_info.num_storage_nodes): - server_nodes = get_ids_on_rank(partition_book, server_rank) + server_nodes = get_ids_on_rank(partition_book=partition_book, rank=server_rank) for compute_rank in range(cluster_info.num_compute_nodes): generated_nodes = shard_nodes_by_process( - server_nodes, compute_rank, cluster_info.num_processes_per_compute + input_nodes=server_nodes, + local_process_rank=compute_rank, + local_process_world_size=cluster_info.num_compute_nodes, ) expected_sampler_input[compute_rank].append(generated_nodes) return dict(expected_sampler_input) class GraphStoreIntegrationTest(TestCase): + """ + NOTE: Since these tests run on cloud build, + and our python process memory footprint is quite large due to tf, torch, etc, + We need to be careful to not spawn too many processes. 
+ Otherwise we will OOM and see "mysterious" failures like the below: + make: *** [Makefile:119: integration_test] Error 137 + ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 + ERROR: build step 0 "docker-img/path:tag" failed: step exited with non-zero status: 2 + """ + def test_graph_store_homogeneous(self): # Simulating two server machine, two compute machines. # Each machine has one process. @@ -625,7 +637,7 @@ def test_homogeneous_training(self): cluster_info = GraphStoreInfo( num_storage_nodes=2, num_compute_nodes=2, - num_processes_per_compute=2, + num_processes_per_compute=1, cluster_master_ip=host_ip, storage_cluster_master_ip=host_ip, compute_cluster_master_ip=host_ip, diff --git a/tests/test_assets/test_case.py b/tests/test_assets/test_case.py index d7ed45b38..eddc762ec 100644 --- a/tests/test_assets/test_case.py +++ b/tests/test_assets/test_case.py @@ -12,7 +12,7 @@ logger = Logger() -DEFAULT_TIMEOUT_SECONDS: Final[float] = 300.0 +DEFAULT_TIMEOUT_SECONDS: Final[float] = 60.0 * 10 # 10 minutes DEFAULT_POLL_INTERVAL_SECONDS: Final[float] = 0.1