Skip to content

Commit 3bee1ff

Browse files
authored
Accept launch parameters as arguments to forward kernel (#36)
So that we can tune them on the fly... Usage: ```python out_img = render_fmbs( scene, blender=blender, confidence=confidence_model, intr=intr, extr=extr, img=image, grid_size=dim3(16, 16), block_size=dim3(16, 16), ) ``` e.g. For 256x256 image on 5090 (the timing isn't really meaningful here other than showing that this works) <img width="1480" height="1730" alt="image" src="https://github.com/user-attachments/assets/9c9aa3e3-6972-4feb-87e6-4b130799b99f" />
1 parent b12a079 commit 3bee1ff

7 files changed

Lines changed: 33 additions & 15 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ set(CMAKE_CXX_STANDARD 20)
1010
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
1111

1212
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
13+
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xptxas -O3")
1314

1415
################
1516
# Core Library #

genmetaballs/src/cuda/bindings.cu

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,17 @@ NB_MODULE(_genmetaballs_bindings, m) {
228228
bind_array2d<float, MemoryLocation::HOST>(utils, "CPUFloatArray2D");
229229
bind_array2d<float, MemoryLocation::DEVICE>(utils, "GPUFloatArray2D");
230230

231+
// bind dim3, which is used to specify the launch configuration for the kernel
232+
nb::class_<dim3>(utils, "dim3")
233+
.def(nb::init<uint32_t, uint32_t, uint32_t>(), nb::arg("x") = 1, nb::arg("y") = 1,
234+
nb::arg("z") = 1)
235+
.def_prop_ro("x", [](const dim3& self) { return self.x; })
236+
.def_prop_ro("y", [](const dim3& self) { return self.y; })
237+
.def_prop_ro("z", [](const dim3& self) { return self.z; })
238+
.def("__repr__", [](const dim3& self) {
239+
return nb::str("dim3(x={}, y={}, z={})").format(self.x, self.y, self.z);
240+
});
241+
231242
} // NB_MODULE(_genmetaballs_bindings)
232243

233244
template <typename T, MemoryLocation location>
@@ -319,5 +330,5 @@ void bind_render_fmbs(nb::module_& m, const char* name) {
319330
&render_fmbs<AllGetter<MemoryLocation::DEVICE>, LinearIntersector, Blender, Confidence>,
320331
"Render the given FMB scene into the provided image view", nb::arg("fmbs"),
321332
nb::arg("blender"), nb::arg("confidence"), nb::arg("intr"), nb::arg("extr"),
322-
nb::arg("img"));
333+
nb::arg("img"), nb::arg("grid_size"), nb::arg("block_size"));
323334
}

genmetaballs/src/cuda/core/camera.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ CUDA_CALLABLE PixelCoordRange::Iterator& PixelCoordRange::Iterator::operator++()
2626
return *this;
2727
}
2828

29-
CUDA_CALLABLE bool PixelCoordRange::Sentinel::operator==(const Iterator& it) const {
29+
CUDA_CALLABLE bool operator!=(const PixelCoordRange::Iterator& it,
30+
const PixelCoordRange::Sentinel& sentinel) {
3031
// stop if we reach the end of rows, or if the range is empty
31-
return it.py >= py_end || it.px_start >= it.px_end || it.py_start >= py_end;
32+
return it.py < sentinel.py_end && it.px_start < it.px_end && it.py_start < sentinel.py_end;
3233
}
3334

3435
CUDA_CALLABLE PixelCoordRange::Iterator PixelCoordRange::begin() const {

genmetaballs/src/cuda/core/camera.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,11 @@ struct PixelCoordRange {
4949
// the Sentinel class only needs to hold the stop value (i.e. final row)
5050
struct Sentinel {
5151
uint32_t py_end;
52-
53-
// stopping criterion: true if current row (py) reaches py_end
54-
CUDA_CALLABLE bool operator==(const Iterator& it) const;
5552
};
5653

54+
// stopping criterion: true if current row (py) reaches py_end
55+
friend CUDA_CALLABLE bool operator!=(const Iterator& it, const Sentinel& sentinel);
56+
5757
// range methods
5858
CUDA_CALLABLE Iterator begin() const;
5959
CUDA_CALLABLE Sentinel end() const;

genmetaballs/src/cuda/core/forward.cuh

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,6 @@
99
#include "image.cuh"
1010
#include "utils.cuh"
1111

12-
// TODO: tune this number
13-
constexpr auto NUM_BLOCKS = dim3(4, 4);
14-
constexpr auto THREADS_PER_BLOCK = dim3(16, 16);
15-
1612
CUDA_CALLABLE PixelCoordRange get_pixel_coords(const dim3 thread_idx, const dim3 block_idx,
1713
const dim3 block_dim, const dim3 grid_dim,
1814
const Intrinsics& intr);
@@ -49,7 +45,8 @@ __global__ void render_kernel(const FMBScene<MemoryLocation::DEVICE>& fmbs, cons
4945
template <typename Getter, typename Intersector, typename Blender, typename Confidence>
5046
void render_fmbs(const FMBScene<MemoryLocation::DEVICE>& fmbs, const Blender& blender,
5147
const Confidence& confidence, const Intrinsics& intr, const Pose& extr,
52-
ImageView<MemoryLocation::DEVICE> img) {
48+
ImageView<MemoryLocation::DEVICE> img, const dim3 grid_size,
49+
const dim3 block_size) {
5350
render_kernel<Getter, Intersector, Blender, Confidence>
54-
<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(fmbs, blender, confidence, intr, extr, img);
51+
<<<grid_size, block_size>>>(fmbs, blender, confidence, intr, extr, img);
5552
}

genmetaballs/src/genmetaballs/core/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
)
1313
from genmetaballs._genmetaballs_bindings.fmb import FMB, CPUFMBScene, GPUFMBScene
1414
from genmetaballs._genmetaballs_bindings.image import CPUImage, GPUImage
15-
from genmetaballs._genmetaballs_bindings.utils import CPUFloatArray2D, GPUFloatArray2D, sigmoid
15+
from genmetaballs._genmetaballs_bindings.utils import (
16+
CPUFloatArray2D,
17+
GPUFloatArray2D,
18+
dim3,
19+
sigmoid,
20+
)
1621

1722
type DeviceType = Literal["cpu", "gpu"]
1823

@@ -83,6 +88,8 @@ def render_fmbs(
8388
intr: Intrinsics,
8489
extr: geometry.Pose,
8590
img: GPUImage | None = None,
91+
grid_size: dim3 = dim3(4, 4),
92+
block_size: dim3 = dim3(16, 16),
8693
) -> GPUImage:
8794
"""Render the given FMB scene into the provided image view.
8895
@@ -105,7 +112,7 @@ def render_fmbs(
105112
else:
106113
raise TypeError("Unsupported blender and confidence combination.")
107114

108-
render_func(fmbs, blender, confidence, intr, extr, img.as_view())
115+
render_func(fmbs, blender, confidence, intr, extr, img.as_view(), grid_size, block_size)
109116
return img
110117

111118

@@ -122,6 +129,7 @@ def render_fmbs(
122129
"Camera",
123130
"FourParameterBlender",
124131
"FMB",
132+
"dim3",
125133
"Intrinsics",
126134
"ThreeParameterBlender",
127135
"TwoParameterConfidence",

tests/cpp_tests/test_confidence.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ TEST(GpuConfidenceTest, ConfidenceMultipleValuesGPU_AllTypes) {
101101

102102
std::vector<float> actual;
103103
if (conf_case.is_two_param) {
104-
TwoParameterConfidence conf(conf_case.beta4, conf_case.beta5);
104+
TwoParameterConfidence conf{conf_case.beta4, conf_case.beta5};
105105
actual = gpu_get_confidence(sumexpd_vec, conf);
106106
} else {
107107
ZeroParameterConfidence conf;

0 commit comments

Comments
 (0)