## Release v0.9
New features:
- Add coefficient of variation to bandwidth output statistics
- Add huge page support for host memory (disabled on Windows)
- Add option to sample pairs in device-to-device tests
- Add troubleshooting guide
- Unify multinode and single-node execution paths
Improvements:
- Improve CUDA architecture detection without requiring GPU access
- Deprecate Volta (sm_70/sm_72) support for CUDA toolkit >=13.0
Bug fixes:
- Fix JSON output aggregation
Platform:
- Skip Boost static libs on Azure Linux
For large multinode systems, testing all possible GPU pairs can be time-consuming and resource-intensive. nvbandwidth provides a sampling option to reduce test time while maintaining good coverage of the GPU topology.
#### Sampling Options
- **`--targetNumPairs -1`** (default): Test all possible pairs (N×(N-1) for N GPUs)
- **`--targetNumPairs <number>`**: Test exactly `<number>` pairs using intelligent sampling
For example, to run multinode bandwidth tests on a system with 4 nodes and 8 GPUs per node while sampling only a subset of the possible GPU pairs:
```
# For a 4-node, 8-GPU system: 992 total pairs available
# Using --targetNumPairs 100 will test ~10% of pairs while covering all GPUs
```
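
As a sketch only (the MPI launcher, rank count, hostfile, and binary path here are assumptions that depend on how your cluster is set up), such a sampled run might be launched like this:

```
# Hypothetical launch: one rank per GPU (32 ranks across 4 nodes),
# sampling 100 of the 992 possible GPU pairs
mpirun -np 32 -hostfile hosts.txt ./nvbandwidth --targetNumPairs 100
```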
**Note**: The `--targetNumPairs` parameter only affects multinode device-to-device tests. In single-node mode, this parameter is ignored and a warning will be displayed if a positive value is specified.
### Local testing
You can test the multinode path on a single-node machine (Ampere+ GPU required):
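
A minimal sketch of such a local run, assuming MPI is installed and the nvbandwidth binary sits in the current directory (the rank count of 2 is an arbitrary assumption):

```
# Hypothetical local test: two MPI ranks on one machine exercising the multinode path
mpirun -np 2 ./nvbandwidth
```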
SM copies will truncate the copy size to fit uniformly on the target device.
`threadsPerBlock` is set to 512.
### Latency Measurements
nvbandwidth uses **pointer chasing** to measure memory latency rather than simple sequential access patterns. This methodology provides more realistic latency measurements by forcing random memory access patterns that prevent prefetching optimizations.
#### How Pointer Chasing Works
1. **Setup**: Memory is organized as a linked list where each node contains a pointer to the next node
2. **Pattern**: The chain follows a strided pattern through memory: `Node[i] -> Node[(i + stride) % total_nodes]`
3. **Execution**: The kernel follows this pointer chain for a specified number of accesses
4. **Measurement**: Total time divided by the number of accesses gives the latency per access (see the sketch after this list)
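
As an illustration only (this is a minimal sketch, not nvbandwidth's actual kernel; the buffer size, stride, and access count below are arbitrary assumptions), a pointer-chasing latency measurement in CUDA might look like this:

```
// Minimal pointer-chasing sketch (illustrative, not nvbandwidth's implementation).
// Each chain[i] holds the index of the next node, so every load depends on the
// previous one and the accesses serialize.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void chase(const unsigned int *chain, unsigned int start,
                      unsigned int numAccesses, unsigned int *sink) {
    unsigned int idx = start;
    for (unsigned int i = 0; i < numAccesses; ++i) {
        idx = chain[idx];  // dependent load: the next address is unknown until this completes
    }
    *sink = idx;           // keep the loop from being optimized away
}

int main() {
    const unsigned int totalNodes = 1u << 20;  // assumed buffer of 2^20 nodes
    const unsigned int stride = 9973;          // assumed odd stride: visits every node once per cycle
    const unsigned int numAccesses = 1u << 20;

    // Build the strided chain: Node[i] -> Node[(i + stride) % total_nodes]
    unsigned int *hChain = new unsigned int[totalNodes];
    for (unsigned int i = 0; i < totalNodes; ++i)
        hChain[i] = (i + stride) % totalNodes;

    unsigned int *dChain, *dSink;
    cudaMalloc(&dChain, totalNodes * sizeof(unsigned int));
    cudaMalloc(&dSink, sizeof(unsigned int));
    cudaMemcpy(dChain, hChain, totalNodes * sizeof(unsigned int), cudaMemcpyHostToDevice);

    cudaEvent_t startEv, stopEv;
    cudaEventCreate(&startEv);
    cudaEventCreate(&stopEv);

    cudaEventRecord(startEv);
    chase<<<1, 1>>>(dChain, 0, numAccesses, dSink);  // one thread: pure latency, no overlap
    cudaEventRecord(stopEv);
    cudaEventSynchronize(stopEv);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, startEv, stopEv);
    printf("Average latency: %.1f ns per access\n", ms * 1e6f / numAccesses);

    delete[] hChain;
    cudaFree(dChain);
    cudaFree(dSink);
    return 0;
}
```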
#### Important Notes
- **TLB costs are excluded**: Because the same buffer is accessed across all tests, TLB entries stay cached, eliminating address translation overhead from the latency measurements.
- **Data cache hits are prevented**: Random pointer-chasing patterns prevent data cache benefits by design.
This approach provides latency measurements that benefit from TLB optimization while maintaining random memory access patterns that prevent data cache effects.