diff --git a/CUDATools/src/profile.jl b/CUDATools/src/profile.jl index c6f37fdbb0..a171069406 100644 --- a/CUDATools/src/profile.jl +++ b/CUDATools/src/profile.jl @@ -934,8 +934,13 @@ function Base.show(io::IO, results::ProfileResults) println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)") end - # add memory throughput information - device = merge(device, (; throughput=device.size ./ device.time)) + # add memory throughput information. CUPTI's timestamp resolution + # can round very short events down to 0 ns, so guard against the + # resulting Inf throughput (which would later trip up format_bytes). + throughput = map(device.size, device.time) do s, t + (s === missing || !isfinite(t) || t == 0) ? missing : s / t + end + device = merge(device, (; throughput)) if isempty(device.id) println(io, "\nNo device-side activity was recorded.") diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl index 4ad292390c..5b50685435 100644 --- a/test/core/device/intrinsics/math.jl +++ b/test/core/device/intrinsics/math.jl @@ -120,9 +120,11 @@ using SpecialFunctions end end - # NVPTX has no sub. intrinsic; sub_(x,y) reuses add_(x,-y). - # For non-rn modes LLVM keeps the rounded add; for rn (the default) it - # may fold back to a plain `sub`. + # NVPTX has no `llvm.nvvm.sub.<rnd>` intrinsic, so sub_<rnd>(x,y) is + # implemented as add_<rnd>(x,-y). PTX itself does accept rounding + # modifiers on `sub`, so the backend may fold back to a real + # `sub.<rnd>.<suffix>` (LLVM 22) or keep the rounded add (older LLVM). + # For `rn` the suffix may be elided entirely. for rnd in (:rn, :rz, :rm, :rp) f = getfield(CUDA, Symbol(:sub_, rnd)) for (T, suffix) in ((Float32, "f32"), (Float64, "f64")) @@ -130,8 +132,9 @@ using SpecialFunctions buf = CuArray{T}(undef, 1) ptx = sprint(io->(@device_code_ptx io=io @cuda launch=false kernel(buf, T(1), T(1)))) accepted = rnd === :rn ? 
- ("add.rn.$(suffix)", "add.$(suffix)", "sub.$(suffix)") : - ("add.$(rnd).$(suffix)",) + ("add.rn.$(suffix)", "add.$(suffix)", + "sub.rn.$(suffix)", "sub.$(suffix)") : + ("add.$(rnd).$(suffix)", "sub.$(rnd).$(suffix)") @test any(s -> occursin(s, ptx), accepted) end end