diff --git a/tests/cpp/test_multidevice_lower_communication_cuda.cpp b/tests/cpp/test_multidevice_lower_communication_cuda.cpp index 3164f193132..5b7a0ea874b 100644 --- a/tests/cpp/test_multidevice_lower_communication_cuda.cpp +++ b/tests/cpp/test_multidevice_lower_communication_cuda.cpp @@ -80,7 +80,7 @@ class LowerCollectiveCudaAndNcclTest at::Tensor runBenchmark( MultiDeviceExecutor& executor, const std::vector& inputs, - int64_t msg_size_bytes, + int64_t message_size_bytes, CommunicatorBackend backend_type, const std::string& test_name, float bandwidth_multiplier = 1.0f, @@ -126,11 +126,12 @@ class LowerCollectiveCudaAndNcclTest // Print results on rank 0 if (communicator_->deviceId() == 0) { float mean_cpu_time_ms = time_tensor.item(); - float cpu_bandwidth_gbps = (msg_size_bytes * bandwidth_multiplier / + float cpu_bandwidth_gbps = (message_size_bytes * bandwidth_multiplier / (mean_cpu_time_ms / 1000.0)) / 1e9; std::cout << test_name << " - Backend: " << backend_type - << ", Size: " << (msg_size_bytes / (1024.0 * 1024.0)) << " MB" + << ", Size: " << (message_size_bytes / (1024.0 * 1024.0)) + << " MB" << ", Avg CPU time: " << mean_cpu_time_ms << " ms" << ", CPU Bandwidth: " << cpu_bandwidth_gbps << " GB/s" << std::endl; @@ -177,8 +178,8 @@ class LowerCollectiveCudaAndNcclTest }; TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) { - const auto& [msg_size_bytes, protocol_enum] = GetParam(); - const int64_t kMsgSize = msg_size_bytes / sizeof(float); + const auto& [message_size_bytes, protocol_enum] = GetParam(); + const int64_t message_size = message_size_bytes / sizeof(float); const CommunicatorBackend backend_type = getBackend(protocol_enum); const std::string protocol_str = getProtocolString(protocol_enum); @@ -192,6 +193,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) { GTEST_SKIP() << "Device does not support Multicast; skipping."; } + if (message_size_bytes > 32LL * 1024 * 1024) { + GTEST_SKIP() << "Takes >30 seconds to run in CI: http://nv/e.)"; + } + // cudaMemcpyBatchAsync requires a non-default stream c10::cuda::CUDAStream stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false); @@ -219,7 +224,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) { in->axis(0)->parallelize(ParallelType::DIDx); at::Tensor unsharded_tensor = - at::randn({num_devices, kMsgSize}, tensor_options_); + at::randn({num_devices, message_size}, tensor_options_); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); MultiDeviceExecutorParams params; @@ -232,7 +237,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) { at::Tensor out_tensor = runBenchmark( executor, {in_tensor}, - msg_size_bytes, + message_size_bytes, backend_type, "Allgather/" + protocol_str, static_cast(communicator_->size())); @@ -241,10 +246,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) { } TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) { - const auto& [msg_size_bytes, protocol_enum] = GetParam(); + const auto& [message_size_bytes, protocol_enum] = GetParam(); const CommunicatorBackend backend_type = getBackend(protocol_enum); const std::string protocol_str = getProtocolString(protocol_enum); - const int64_t kMsgSize = msg_size_bytes / sizeof(float); + const int64_t message_size = message_size_bytes / sizeof(float); if (!communicator_->is_available() || communicator_->size() < 2) { GTEST_SKIP() << "This test needs at least 2 ranks."; @@ -256,6 +261,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) { GTEST_SKIP() << "Device does not support Multicast; skipping."; } + if (message_size_bytes > 32LL * 1024 * 1024) { + GTEST_SKIP() << "Takes >5 seconds to run in CI: http://nv/e.)"; + } + // cudaMemcpyBatchAsync requires a non-default stream c10::cuda::CUDAStream stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false); @@ -289,7 +298,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) { std::move(fusion), Communicator::getInstance(), params); at::Tensor unsharded_tensor = - at::randn({num_devices, kMsgSize}, tensor_options_); + at::randn({num_devices, message_size}, tensor_options_); const auto device_id = communicator_->deviceId(); at::Tensor in_tensor = unsharded_tensor.slice(0, device_id, device_id + 1); @@ -297,7 +306,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) { at::Tensor out_tensor = runBenchmark( executor, {in_tensor}, - msg_size_bytes, + message_size_bytes, backend_type, "Broadcast/" + protocol_str, 1.0f); @@ -310,10 +319,10 @@ namespace { std::string paramToStringLowerCollectiveCudaAndNcclTest( const testing::TestParamInfo>& info) { - const auto& [msg_size_bytes, protocol_enum] = info.param; + const auto& [message_size_bytes, protocol_enum] = info.param; std::stringstream ss; ss << getProtocolString(protocol_enum) << "_"; - int64_t size_mb = msg_size_bytes / (1024 * 1024); + int64_t size_mb = message_size_bytes / (1024 * 1024); if (size_mb >= 1024) { ss << (size_mb / 1024) << "GB"; } else {