Skip to content

Commit 64b8c07

Browse files
authored
bench: Initial cuda.bindings latency benchmarks structure (#1736)
1 parent 56edbb0 commit 64b8c07

File tree

19 files changed

+2421
-0
lines changed

19 files changed

+2421
-0
lines changed

.github/workflows/test-wheel-linux.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,14 @@ jobs:
261261
LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
262262
run: run-tests bindings
263263

264+
- name: Run cuda.bindings benchmarks (smoke test)
265+
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
266+
run: |
267+
pip install pyperf
268+
pushd cuda_bindings/benchmarks
269+
python run_pyperf.py --fast --loops 1 --min-time 0
270+
popd
271+
264272
- name: Run cuda.core tests
265273
env:
266274
CUDA_VER: ${{ matrix.CUDA_VER }}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Build artifacts
2+
.build/
3+
__pycache__/
4+
5+
# Benchmark results
6+
*.json
7+
.benchmarks/
8+
9+
# Pixi environments
10+
.pixi/
11+
12+
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
13+
!benchmarks/cpp/*.cpp

cuda_bindings/benchmarks/README.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# cuda.bindings Benchmarks
2+
3+
## Usage
4+
5+
Requires pixi.
6+
7+
There are a couple of environments defined based on how `cuda.bindings` is installed:
8+
9+
- `wheel`: Installs from conda packages
10+
- `source`: Installs from source
11+
12+
There are a couple of tasks defined:
13+
14+
- `bench`: Runs the Python benchmarks
15+
- `bench-cpp`: Runs the C++ benchmarks
16+
17+
### System tuning
18+
19+
For more stable results on Linux, tune the system before running benchmarks.
20+
See: https://pyperf.readthedocs.io/en/latest/system.html#system
21+
22+
```bash
23+
# Show current system state
24+
pixi run -e wheel -- python -m pyperf system show
25+
26+
# Apply tuning (may require root)
27+
sudo $(pixi run -e wheel -- which python) -m pyperf system tune
28+
```
29+
30+
### Running benchmarks
31+
32+
To run the benchmarks, combine the environment and task:
33+
34+
```bash
35+
36+
# Run the Python benchmarks in the wheel environment
37+
pixi run -e wheel bench
38+
39+
# Run the Python benchmarks in the source environment
40+
pixi run -e source bench
41+
42+
# Run the C++ benchmarks (environment is irrelevant here)
43+
pixi run -e wheel bench-cpp
44+
```
45+
46+
## pyperf JSON
47+
48+
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
49+
The results are written to a JSON file in the format expected by pyperf.
50+
51+
The C++ benchmarks also generate a valid JSON file, in the same format.
52+
53+
```
54+
pixi run -e wheel bench-cpp -o cpp.json
55+
56+
pixi run -e wheel pyperf stats cpp.json
57+
```
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import alloc_persistent
8+
9+
from cuda.bindings import driver as cuda
10+
11+
# Allocate memory used by the tests
12+
PTR = alloc_persistent(1 << 18)
13+
ATTRIBUTE = cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE
14+
15+
16+
def bench_pointer_get_attribute(loops: int) -> float:
    """Time ``loops`` consecutive ``cuPointerGetAttribute`` calls.

    Args:
        loops: Number of calls issued inside the timed region.

    Returns:
        The ``time.perf_counter()`` delta measured around the whole loop.
    """
    # Bind everything the loop touches to locals so the timed region does
    # not pay for global or attribute lookups on each iteration.
    get_attribute = cuda.cuPointerGetAttribute
    attribute = ATTRIBUTE
    pointer = PTR

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attribute, pointer)
    elapsed = time.perf_counter() - start
    return elapsed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
cmake_minimum_required(VERSION 3.24)
6+
project(cuda_bindings_cpp_benchmarks LANGUAGES CXX)
7+
8+
set(CMAKE_CXX_STANDARD 17)
9+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
10+
set(CMAKE_CXX_EXTENSIONS OFF)
11+
12+
set(CUDA_HOME_HINT "$ENV{CUDA_HOME}")
13+
set(CONDA_PREFIX_HINT "$ENV{CONDA_PREFIX}")
14+
15+
# Find cuda.h (driver API header)
16+
find_path(
17+
CUDA_DRIVER_INCLUDE_DIR
18+
cuda.h
19+
HINTS
20+
"${CUDA_HOME_HINT}/include"
21+
"${CONDA_PREFIX_HINT}/targets/x86_64-linux/include"
22+
"${CONDA_PREFIX_HINT}/include"
23+
)
24+
25+
# Find libcuda (driver API library) — lives on the system, not in toolkit
26+
find_library(
27+
CUDA_DRIVER_LIBRARY
28+
NAMES cuda
29+
HINTS
30+
"/usr/lib/x86_64-linux-gnu"
31+
"/usr/lib64"
32+
"${CUDA_HOME_HINT}/lib64/stubs"
33+
"${CUDA_HOME_HINT}/lib/stubs"
34+
"${CONDA_PREFIX_HINT}/targets/x86_64-linux/lib/stubs"
35+
"${CONDA_PREFIX_HINT}/lib/stubs"
36+
)
37+
38+
if(NOT CUDA_DRIVER_INCLUDE_DIR)
39+
message(FATAL_ERROR "Could not find cuda.h. Ensure CUDA_HOME is set or install cuda-crt-dev.")
40+
endif()
41+
42+
if(NOT CUDA_DRIVER_LIBRARY)
43+
message(FATAL_ERROR "Could not find libcuda. Ensure the NVIDIA driver is installed.")
44+
endif()
45+
46+
add_executable(bench_pointer_attributes_cpp bench_pointer_attributes.cpp)
47+
target_include_directories(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_INCLUDE_DIR}")
48+
target_link_libraries(bench_pointer_attributes_cpp PRIVATE "${CUDA_DRIVER_LIBRARY}")
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#include <cuda.h>
6+
7+
#include "bench_support.hpp"
8+
9+
#include <cstdlib>
10+
#include <iostream>
11+
12+
13+
// Terminates the process when a CUDA driver call did not succeed.
//
// On any status other than CUDA_SUCCESS this writes "<message>: <error name>"
// to stderr and exits with status 1. "unknown" is printed when
// cuGetErrorName leaves the name pointer null.
static void check_cu(CUresult status, const char* message) {
    if (status == CUDA_SUCCESS) {
        return;
    }

    const char* name = nullptr;
    cuGetErrorName(status, &name);
    if (name == nullptr) {
        name = "unknown";
    }
    std::cerr << message << ": " << name << '\n';
    std::exit(1);
}
21+
22+
23+
int main(int argc, char** argv) {
24+
bench::Options options = bench::parse_args(argc, argv);
25+
if (options.benchmark_name.empty()) {
26+
options.benchmark_name = "cpp.pointer_attributes.pointer_get_attribute";
27+
}
28+
29+
// Setup: init CUDA, allocate memory
30+
check_cu(cuInit(0), "cuInit failed");
31+
32+
CUdevice device;
33+
check_cu(cuDeviceGet(&device, 0), "cuDeviceGet failed");
34+
35+
CUcontext ctx;
36+
CUctxCreateParams ctxParams = {};
37+
check_cu(cuCtxCreate(&ctx, &ctxParams, 0, device), "cuCtxCreate failed");
38+
39+
CUdeviceptr ptr;
40+
check_cu(cuMemAlloc(&ptr, 1 << 18), "cuMemAlloc failed");
41+
42+
unsigned int memory_type = 0;
43+
44+
// Run benchmark
45+
auto results = bench::run_benchmark(options, [&]() {
46+
check_cu(
47+
cuPointerGetAttribute(&memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr),
48+
"cuPointerGetAttribute failed"
49+
);
50+
});
51+
52+
// Sanity check: the call actually did something
53+
if (memory_type == 0) {
54+
std::cerr << "unexpected memory_type=0\n";
55+
}
56+
57+
// Cleanup
58+
check_cu(cuMemFree(ptr), "cuMemFree failed");
59+
check_cu(cuCtxDestroy(ctx), "cuCtxDestroy failed");
60+
61+
// Output
62+
bench::print_summary(options.benchmark_name, results);
63+
64+
if (!options.output_path.empty()) {
65+
bench::write_pyperf_json(options.output_path, options.benchmark_name, options.loops, results);
66+
}
67+
68+
return 0;
69+
}

0 commit comments

Comments
 (0)