Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions tests/cpp/test_multidevice_lower_communication_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class LowerCollectiveCudaAndNcclTest
at::Tensor runBenchmark(
MultiDeviceExecutor& executor,
const std::vector<c10::IValue>& inputs,
int64_t msg_size_bytes,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int64_t message_size_bytes,
CommunicatorBackend backend_type,
const std::string& test_name,
float bandwidth_multiplier = 1.0f,
Expand Down Expand Up @@ -126,11 +126,12 @@ class LowerCollectiveCudaAndNcclTest
// Print results on rank 0
if (communicator_->deviceId() == 0) {
float mean_cpu_time_ms = time_tensor.item<float>();
float cpu_bandwidth_gbps = (msg_size_bytes * bandwidth_multiplier /
float cpu_bandwidth_gbps = (message_size_bytes * bandwidth_multiplier /
(mean_cpu_time_ms / 1000.0)) /
1e9;
std::cout << test_name << " - Backend: " << backend_type
<< ", Size: " << (msg_size_bytes / (1024.0 * 1024.0)) << " MB"
<< ", Size: " << (message_size_bytes / (1024.0 * 1024.0))
<< " MB"
<< ", Avg CPU time: " << mean_cpu_time_ms << " ms"
<< ", CPU Bandwidth: " << cpu_bandwidth_gbps << " GB/s"
<< std::endl;
Expand Down Expand Up @@ -177,8 +178,8 @@ class LowerCollectiveCudaAndNcclTest
};

TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) {
const auto& [msg_size_bytes, protocol_enum] = GetParam();
const int64_t kMsgSize = msg_size_bytes / sizeof(float);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const auto& [message_size_bytes, protocol_enum] = GetParam();
const int64_t message_size = message_size_bytes / sizeof(float);
const CommunicatorBackend backend_type = getBackend(protocol_enum);
const std::string protocol_str = getProtocolString(protocol_enum);

Expand All @@ -192,6 +193,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) {
GTEST_SKIP() << "Device does not support Multicast; skipping.";
}

if (message_size_bytes > 32LL * 1024 * 1024) {
    GTEST_SKIP() << "Takes >30 seconds to run in CI";
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Skips here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Syntax: the skip message contains a malformed URL — `http://nv/e.)` appears truncated (stray trailing `)`); either fix the link or remove it.

Suggested change
GTEST_SKIP() << "Takes >30 seconds to run in CI: http://nv/e.)";
GTEST_SKIP() << "Takes >30 seconds to run in CI";

}

// cudaMemcpyBatchAsync requires a non-default stream
c10::cuda::CUDAStream stream =
c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
Expand Down Expand Up @@ -219,7 +224,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) {
in->axis(0)->parallelize(ParallelType::DIDx);

at::Tensor unsharded_tensor =
at::randn({num_devices, kMsgSize}, tensor_options_);
at::randn({num_devices, message_size}, tensor_options_);
at::Tensor in_tensor = shardTensor(unsharded_tensor, in);

MultiDeviceExecutorParams params;
Expand All @@ -232,7 +237,7 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) {
at::Tensor out_tensor = runBenchmark(
executor,
{in_tensor},
msg_size_bytes,
message_size_bytes,
backend_type,
"Allgather/" + protocol_str,
static_cast<float>(communicator_->size()));
Expand All @@ -241,10 +246,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Allgather) {
}

TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) {
const auto& [msg_size_bytes, protocol_enum] = GetParam();
const auto& [message_size_bytes, protocol_enum] = GetParam();
const CommunicatorBackend backend_type = getBackend(protocol_enum);
const std::string protocol_str = getProtocolString(protocol_enum);
const int64_t kMsgSize = msg_size_bytes / sizeof(float);
const int64_t message_size = message_size_bytes / sizeof(float);

if (!communicator_->is_available() || communicator_->size() < 2) {
GTEST_SKIP() << "This test needs at least 2 ranks.";
Expand All @@ -256,6 +261,10 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) {
GTEST_SKIP() << "Device does not support Multicast; skipping.";
}

if (message_size_bytes > 32LL * 1024 * 1024) {
    GTEST_SKIP() << "Takes >5 seconds to run in CI";
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... and here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Syntax: the skip message contains a malformed URL — `http://nv/e.)` appears truncated (stray trailing `)`); either fix the link or remove it.

Suggested change
GTEST_SKIP() << "Takes >5 seconds to run in CI: http://nv/e.)";
GTEST_SKIP() << "Takes >5 seconds to run in CI";

}

// cudaMemcpyBatchAsync requires a non-default stream
c10::cuda::CUDAStream stream =
c10::cuda::getStreamFromPool(/*isHighPriority=*/false);
Expand Down Expand Up @@ -289,15 +298,15 @@ TEST_P(LowerCollectiveCudaAndNcclTest, Broadcast) {
std::move(fusion), Communicator::getInstance(), params);

at::Tensor unsharded_tensor =
at::randn({num_devices, kMsgSize}, tensor_options_);
at::randn({num_devices, message_size}, tensor_options_);
const auto device_id = communicator_->deviceId();
at::Tensor in_tensor = unsharded_tensor.slice(0, device_id, device_id + 1);

// Run benchmark and validate correctness
at::Tensor out_tensor = runBenchmark(
executor,
{in_tensor},
msg_size_bytes,
message_size_bytes,
backend_type,
"Broadcast/" + protocol_str,
1.0f);
Expand All @@ -310,10 +319,10 @@ namespace {
std::string paramToStringLowerCollectiveCudaAndNcclTest(
const testing::TestParamInfo<std::tuple<int64_t, CommunicationProtocol>>&
info) {
const auto& [msg_size_bytes, protocol_enum] = info.param;
const auto& [message_size_bytes, protocol_enum] = info.param;
std::stringstream ss;
ss << getProtocolString(protocol_enum) << "_";
int64_t size_mb = msg_size_bytes / (1024 * 1024);
int64_t size_mb = message_size_bytes / (1024 * 1024);
if (size_mb >= 1024) {
ss << (size_mb / 1024) << "GB";
} else {
Expand Down