Skip to content

Commit 4a49bda

Browse files
authored
Merge pull request #62 from NVIDIA/v0.9
Release v0.9 New features: - Add coefficient of variance to bandwidth output statistics - Add huge page support for host memory (disabled on Windows) - Add option to sample pairs in device-to-device tests - Add troubleshooting guide - Unify multinode and single-node execution paths Improvements: - Improve CUDA architecture detection without requiring GPU access - Deprecate Volta (sm_70/sm_72) support for CUDA toolkit >=13.0 Bug fixes: - Fix JSON output aggregation Platform: - Skip Boost static libs on Azure Linux
2 parents 66746a3 + 1aa9e81 commit 4a49bda

19 files changed

Lines changed: 989 additions & 214 deletions

CMakeLists.txt

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,42 @@
11
cmake_minimum_required(VERSION 3.20)
22

3+
# Record whether CMAKE_CUDA_ARCHITECTURES is already defined at this point,
# i.e. ahead of the project() call that follows. The flag is consulted later to
# tell a user-supplied value apart from any other value of that variable.
if(DEFINED CMAKE_CUDA_ARCHITECTURES)
    message(STATUS "CMAKE_CUDA_ARCHITECTURES set by user: ${CMAKE_CUDA_ARCHITECTURES}")
    set(CUDA_ARCH_DEFINED_BY_USER TRUE)
endif()
8+
39
project(nvbandwidth
410
LANGUAGES CUDA CXX)
5-
611
set(CMAKE_CXX_STANDARD 17)
712
set(CMAKE_CXX_STANDARD_REQUIRED ON)
813
set(CMAKE_CUDA_STANDARD 17)
914
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
1015

16+
# Keep CMAKE_CUDA_ARCHITECTURES only when the user provided it. Otherwise drop
# both the cache entry and the normal variable, so that the
# "if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)" detection steps below are taken.
if(NOT CUDA_ARCH_DEFINED_BY_USER)
    unset(CMAKE_CUDA_ARCHITECTURES CACHE)
    unset(CMAKE_CUDA_ARCHITECTURES)
endif()
21+
1122
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
1223
# 5.2 architecture not supported since CUDA 13.0
13-
set(supported_archs "70" "75" "80" "86" "89" "90" "100")
24+
set(supported_archs "75" "80" "86" "89" "90" "100")
1425
else ()
1526
set(supported_archs "52" "70" "75" "80" "86" "89" "90" "100")
1627
endif()
1728

1829
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
19-
message(STATUS "Detecting underlying CUDA Arch to set CMAKE_CUDA_ARCHITECTURES")
30+
message(STATUS "Detecting underlying CUDA Arch from GPU to set CMAKE_CUDA_ARCHITECTURES")
2031
include(detect_cuda_arch.cmake)
2132
# Set CMAKE_CUDA_ARCHITECURES based on the underlying device
22-
cuda_detect_architectures(supported_archs CMAKE_CUDA_ARCHITECTURES)
33+
cuda_detect_architectures_from_gpu(supported_archs CMAKE_CUDA_ARCHITECTURES)
34+
endif()
35+
36+
# Fallback: if CMAKE_CUDA_ARCHITECTURES is still undefined at this point,
# derive the architecture list from the nvcc version instead of from a GPU.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    message(STATUS "Detecting underlying CUDA Arch from nvcc to set CMAKE_CUDA_ARCHITECTURES")
    # The module uses include_guard(GLOBAL), so including it again is harmless
    # and guarantees the function below is available in this branch.
    include(detect_cuda_arch.cmake)
    # Set CMAKE_CUDA_ARCHITECTURES from the nvcc release.
    # cuda_detect_architectures_from_nvcc() declares a single parameter: the
    # name of the output variable. Passing `supported_archs` in front of it made
    # the function store its result in supported_archs and leave
    # CMAKE_CUDA_ARCHITECTURES undefined.
    cuda_detect_architectures_from_nvcc(CMAKE_CUDA_ARCHITECTURES)
endif()
2441

2542
if(NOT CMAKE_BUILD_TYPE)
@@ -29,7 +46,7 @@ endif()
2946
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
3047
file(READ "/etc/os-release" OS_RELEASE_CONTENT)
3148
# Skip static libs on Fedora - https://github.com/NVIDIA/nvbandwidth/issues/4
32-
if(NOT OS_RELEASE_CONTENT MATCHES "ID=.*fedora")
49+
if(NOT OS_RELEASE_CONTENT MATCHES "ID=.*fedora|azurelinux")
3350
set(Boost_USE_STATIC_LIBS ON)
3451
endif()
3552
else()
@@ -38,6 +55,7 @@ endif()
3855
find_package(Boost COMPONENTS program_options REQUIRED)
3956

4057
set(src
58+
environment.cpp
4159
testcase.cpp
4260
testcases_ce.cpp
4361
testcases_sm.cpp

README.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,11 @@ nvbandwidth CLI:
5959
-s [ --skipVerification ] Skips data verification after copy
6060
-d [ --disableAffinity ] Disable automatic CPU affinity control
6161
-i [ --testSamples ] arg (=3) Iterations of the benchmark
62+
-P [ --targetNumPairs ] arg (=-1) Target pairs for multinode device-to-device tests (-1: all, >0: sampled)
6263
-m [ --useMean ] Use mean instead of median for results
6364
-j [ --json ] Print output in json format instead of plain
6465
text.
66+
-H [--useHugePages] Use huge pages for host memory allocation
6567
```
6668
To run all testcases:
6769
```
@@ -106,6 +108,45 @@ Specify the IP addresses of the cluster nodes in /etc/nvidia-imex/nodes_config.c
106108
For example, to run multinode bandwidth on a system with 2 nodes and 4 GPUs per node run the command:
107109
`mpirun --allow-run-as-root --map-by ppr:4:node --bind-to core -np 8 --report-bindings -q -mca btl_tcp_if_include enP5p9s0 --hostfile /etc/nvidia-imex/nodes_config.cfg ./nvbandwidth -p multinode`
108110

111+
112+
### Pair Sampling for Multinode Tests
113+
114+
For large multinode systems, testing all possible GPU pairs can be time-consuming and resource-intensive. nvbandwidth provides a sampling option to reduce test time while maintaining good coverage of the GPU topology.
115+
116+
#### Sampling Options
117+
118+
- **`--targetNumPairs -1`** (default): Test all possible pairs (N×(N-1) for N GPUs)
119+
- **`--targetNumPairs <number>`**: Test exactly `<number>` pairs using intelligent sampling
120+
121+
For example, to run multinode bandwidth on a system with 2 nodes and 4 GPUs per node (8 GPUs in total), and select 8 pairs of GPUs, run the command:
122+
`mpirun --allow-run-as-root --map-by ppr:4:node --bind-to core -np 8 --report-bindings -q -mca btl_tcp_if_include enP5p9s0 --hostfile /etc/nvidia-imex/nodes_config.cfg ./nvbandwidth -p multinode --targetNumPairs 8`
123+
124+
The command selects random GPU pairs, maximizing coverage of unique GPUs. Bandwidth (BW) for untested pairs will be set to NA.
125+
126+
#### Sampling Algorithm
127+
128+
When using sampling (`targetNumPairs > 0` and less than total pairs), nvbandwidth employs a two-phase approach:
129+
130+
1. **GPU Coverage Phase**: Ensures each GPU participates in at least one test pair
131+
2. **Random Filling Phase**: Fills remaining slots with random pairs, prioritizing uncovered GPUs
132+
133+
This approach maximizes GPU topology coverage even with limited test pairs.
134+
135+
#### Examples
136+
137+
```bash
138+
# Test all pairs (default/full coverage)
139+
mpirun -n 8 ./nvbandwidth -p multinode --targetNumPairs -1
140+
141+
# Test 20 carefully selected pairs
142+
mpirun -n 8 ./nvbandwidth -p multinode --targetNumPairs 20
143+
144+
# For a 4-node system with 8 GPUs per node (32 GPUs): 32 x 31 = 992 total pairs available
145+
# Using --targetNumPairs 100 will test ~10% of pairs while covering all GPUs
146+
```
147+
148+
**Note**: The `--targetNumPairs` parameter only affects multinode device-to-device tests. In single-node mode, this parameter is ignored and a warning will be displayed if a positive value is specified.
149+
109150
### Local testing
110151

111152
You can test it on a single-node machine (Ampere+ GPU required):
@@ -141,6 +182,24 @@ SM copies will truncate the copy size to fit uniformly on the target device to c
141182

142183
threadsPerBlock is set to 512.
143184

185+
### Latency Measurements
186+
187+
nvbandwidth uses **pointer chasing** to measure memory latency rather than simple sequential access patterns. This methodology provides more realistic latency measurements by forcing random memory access patterns that prevent prefetching optimizations.
188+
189+
#### How Pointer Chasing Works
190+
191+
1. **Setup**: Memory is organized as a linked list where each node contains a pointer to the next node
192+
2. **Pattern**: The chain follows a strided pattern through memory: `Node[i] -> Node[(i + stride) % total_nodes]`
193+
3. **Execution**: The kernel follows this pointer chain for a specified number of accesses
194+
4. **Measurement**: Total time divided by the number of accesses gives the latency per access
195+
196+
#### Important Notes
197+
198+
- **TLB costs are excluded**: The same buffer is accessed across all tests, so TLB entries stay cached and address translation overhead does not contribute to the measured latency.
199+
- **Data cache hits prevented**: Random pointer chasing patterns prevent data cache benefits by design
200+
201+
This approach provides latency measurements that benefit from TLB optimization while maintaining random memory access patterns that prevent data cache effects.
202+
144203
### Measurement Details
145204
![](diagrams/measurement.png)
146205

common.h

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <cuda.h>
2424
#include <nvml.h>
2525
#include <float.h>
26+
#include <fstream>
2627
#include <iomanip>
2728
#include <iostream>
2829
#include <ostream>
@@ -56,14 +57,15 @@ extern bool jsonOutput;
5657
// Verbosity
5758
extern bool verbose;
5859
extern bool perfFormatter;
60+
extern bool useHugePages;
5961

6062
#ifdef MULTINODE
6163
extern int localDevice;
6264
extern int localRank;
6365
extern int worldRank;
6466
extern int worldSize;
67+
extern long long targetNumPairs;
6568
#endif
66-
extern char localHostname[STRING_LENGTH];
6769

6870
class Verbosity {
6971
public:
@@ -236,6 +238,22 @@ inline std::string getUnitString(UnitType unitType) {
236238
}
237239
}
238240

241+
/**
 * Reports whether Transparent Huge Pages (THP) are enabled on this host.
 *
 * Reads the first line of /sys/kernel/mm/transparent_hugepage/enabled and
 * returns true when it contains "[always]" or "[madvise]" (the brackets mark
 * the selected mode). Returns false on Windows builds, and when the file
 * cannot be opened.
 */
inline bool hugePagesEnabled() {
#ifdef _WIN32
    // Huge pages not supported in Windows version
    return false;
#else
    std::ifstream thpModeFile("/sys/kernel/mm/transparent_hugepage/enabled");
    if (!thpModeFile.is_open()) {
        return false;
    }

    // Only the first line is inspected; a failed read leaves it empty,
    // which matches neither mode below.
    std::string modes;
    std::getline(thpModeFile, modes);

    const bool alwaysSelected = modes.find("[always]") != std::string::npos;
    const bool madviseSelected = modes.find("[madvise]") != std::string::npos;
    return alwaysSelected || madviseSelected;
#endif
}
256+
239257
// Describe attributes of a single memcpy operation
240258
class MemcpyDescriptor {
241259
public:

detect_cuda_arch.cmake

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ include_guard(GLOBAL)
55

66
# Adapted from https://github.com/rapidsai/rapids-cmake/blob/branch-24.04/rapids-cmake/cuda/detail/detect_architectures.cmake
77

8-
function(cuda_detect_architectures possible_archs_var gpu_archs)
8+
function(cuda_detect_architectures_from_gpu possible_archs_var gpu_archs)
99

1010
set(__gpu_archs ${${possible_archs_var}})
1111

@@ -65,3 +65,27 @@ int main(int argc, char** argv) {
6565
set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
6666

6767
endfunction()
68+
69+
# Detect the CUDA architecture list from the nvcc release, without requiring a GPU.
#
# Usage:
#   cuda_detect_architectures_from_nvcc(<output_variable>)
#   cuda_detect_architectures_from_nvcc(<possible_archs_var> <output_variable>)
#
# The result is always stored (in the caller's scope) in the LAST argument.
# The two-argument form mirrors cuda_detect_architectures_from_gpu(); the first
# argument is accepted for call-site symmetry but is not consulted. Without
# this, a two-argument call would silently overwrite <possible_archs_var> and
# leave the intended output variable unset.
#
# Result: "75;80;86;89;90;100", with "52;60;70" prepended when the nvcc major
# version is below 13.
function(cuda_detect_architectures_from_nvcc output_variable)
    # Two-argument call: redirect the output to the second argument.
    if(ARGC GREATER 1)
        set(output_variable "${ARGV1}")
    endif()

    execute_process(
        COMMAND ${CMAKE_CUDA_COMPILER} --version
        OUTPUT_VARIABLE NVCC_OUT
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    string(REGEX MATCH "release ([0-9]+)\\.([0-9]+)" NVCC_VERSION "${NVCC_OUT}")
    set(NVCC_MAJOR ${CMAKE_MATCH_1})
    set(NVCC_MINOR ${CMAKE_MATCH_2})

    # An unparsable version leaves NVCC_MAJOR empty; the LESS test below is then
    # false and only the base list is used. Make that situation visible.
    if("${NVCC_VERSION}" STREQUAL "")
        message(WARNING "Could not parse nvcc version from '${CMAKE_CUDA_COMPILER} --version'; using base architecture list only")
    endif()

    # Base architecture list (Turing and newer)
    set(ARCH_LIST "75;80;86;89;90;100")

    # Add older architectures only for CUDA < 13.0
    if(NVCC_MAJOR LESS 13)
        list(PREPEND ARCH_LIST "52;60;70")  # Maxwell, Pascal, Volta
        message(STATUS "Including SM52/SM60/SM70 support for CUDA ${NVCC_MAJOR}.${NVCC_MINOR}")
    endif()

    set(${output_variable} "${ARCH_LIST}" PARENT_SCOPE)
    message(STATUS "Final architecture list: ${ARCH_LIST}")
endfunction()

environment.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
#include "environment.h"
17+
18+
#include "error_handling.h"
19+
#include <common.h>
20+
#ifndef _WIN32
21+
#include <unistd.h>
22+
#endif
23+
24+
std::unique_ptr<Environment> Environment::create(int argc, char** argv) {
25+
#ifdef MULTINODE
26+
// When compiled with MULTINODE, always use MultiNodeEnv
27+
return std::make_unique<MultiNodeEnv>();
28+
#else
29+
return std::make_unique<SingleNodeEnv>();
30+
#endif
31+
}
32+
33+
/**
 * Returns the name of the local host.
 *
 * Windows: the value of the COMPUTERNAME environment variable, truncated to
 * STRING_LENGTH - 1 characters, or "unknown" when the variable is unset or empty.
 * Other platforms: the name reported by gethostname(); the call's success is
 * checked with the project's ASSERT macro.
 */
std::string SingleNodeEnv::getHostname() const {
    // Zero-filled so the buffer is NUL-terminated no matter how many of the
    // first STRING_LENGTH - 1 bytes get written below.
    char hostname[STRING_LENGTH] = {0};
#ifdef _WIN32
    // getenv() returns NULL when the variable is not set, so the pointer must
    // be validated before it is handed to strncpy().
    const char* computername = getenv("COMPUTERNAME");
    const char* source = (computername && computername[0] != '\0') ? computername : "unknown";
    strncpy(hostname, source, STRING_LENGTH - 1);
#else
    // Only STRING_LENGTH - 1 bytes are offered, so the final zero byte always
    // survives even if the name is truncated. The call is kept outside the
    // macro argument so it runs independently of how ASSERT expands.
    const int rc = gethostname(hostname, STRING_LENGTH - 1);
    ASSERT(0 == rc);
#endif
    return hostname;
}
48+
49+
#ifdef MULTINODE
/**
 * Initializes MPI and caches this process's world rank, world size, hostname
 * and local rank.
 *
 * The local rank is taken from the OMPI_COMM_WORLD_LOCAL_RANK environment
 * variable when it is set; otherwise the world rank is used.
 */
void MultiNodeEnv::initialize(int argc, char** argv) {
    // Always initialize MPI when MULTINODE is defined
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // The buffer is zero-filled and one byte is held back, so the name stays
    // NUL-terminated even if gethostname() truncates it. On failure the buffer
    // contents are not trusted and a placeholder is stored instead.
    char host[STRING_LENGTH] = {0};
    if (gethostname(host, sizeof(host) - 1) == 0) {
        hostname = host;
    } else {
        hostname = "unknown";
    }

    // Get local rank
    const char* localRankStr = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    localRank = localRankStr ? atoi(localRankStr) : rank;
}

/** Shuts down MPI by calling MPI_Finalize(). */
void MultiNodeEnv::finalize() {
    MPI_Finalize();
}
#endif

environment.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
19+
#ifndef ENVIRONMENT_H_
20+
#define ENVIRONMENT_H_
21+
22+
#include <string>
23+
#include <memory>
24+
25+
#ifdef MULTINODE
26+
#include <mpi.h>
27+
#endif
28+
29+
/**
 * Abstract interface to the execution environment.
 *
 * Implementations report the rank, total size, local rank and hostname of the
 * current process and provide initialize/finalize hooks. Instances are
 * obtained through create().
 */
class Environment {
 public:
    virtual ~Environment() = default;

    // Lifecycle hooks.
    virtual void initialize(int argc, char** argv) = 0;
    virtual void finalize() = 0;

    // Identity of the current process within the run.
    virtual int getRank() const = 0;
    virtual int getSize() const = 0;
    virtual int getLocalRank() const = 0;
    virtual std::string getHostname() const = 0;

    // Factory: returns the implementation selected for this build.
    static std::unique_ptr<Environment> create(int argc, char** argv);
};
41+
42+
class SingleNodeEnv : public Environment {
43+
public:
44+
void initialize(int argc, char** argv) override {}
45+
void finalize() override {}
46+
int getRank() const override { return 0; }
47+
int getSize() const override { return 1; }
48+
int getLocalRank() const override { return 0; }
49+
std::string getHostname() const override;
50+
};
51+
52+
#ifdef MULTINODE
53+
class MultiNodeEnv : public Environment {
54+
private:
55+
int rank = 0;
56+
int size = 1;
57+
int localRank = 0;
58+
std::string hostname;
59+
60+
public:
61+
void initialize(int argc, char** argv) override;
62+
void finalize() override;
63+
int getRank() const override { return rank; }
64+
int getSize() const override { return size; }
65+
int getLocalRank() const override { return localRank; }
66+
std::string getHostname() const override { return hostname; }
67+
};
68+
#endif
69+
70+
#endif // ENVIRONMENT_H_

error_handling.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,18 @@
1818
#ifndef ERROR_HANDLING_H_
1919
#define ERROR_HANDLING_H_
2020

21+
#include <environment.h>
22+
2123
void RecordError(const std::stringstream &errmsg);
2224

23-
#ifdef MULTINODE
24-
#define HOST_INFO " on " << localHostname << ", rank = " << worldRank
25-
#else
26-
#define HOST_INFO ""
27-
#endif
25+
extern std::unique_ptr<Environment> env;
2826

2927
#ifdef MULTINODE
3028
#include <mpi.h>
29+
#define HOST_INFO " on " << (env ? env->getHostname() : "unknown") << ", rank = " << worldRank
3130
#define MPI_ABORT MPI_Abort(MPI_COMM_WORLD, 1)
3231
#else
32+
#define HOST_INFO ""
3333
#define MPI_ABORT
3434
#endif
3535

0 commit comments

Comments
 (0)