diff --git a/CUDACore/Project.toml b/CUDACore/Project.toml
index 2c9992781e..820b3cf86e 100644
--- a/CUDACore/Project.toml
+++ b/CUDACore/Project.toml
@@ -53,7 +53,7 @@ ChainRulesCore = "1"
 EnzymeCore = "0.8.2"
 ExprTools = "0.1"
 GPUArrays = "11.5.4"
-GPUCompiler = "1.12"
+GPUCompiler = "1.13.1"
 GPUToolbox = "1.1"
 KernelAbstractions = "0.9.38"
 LLVM = "9.6"
diff --git a/CUDACore/src/device/intrinsics/math.jl b/CUDACore/src/device/intrinsics/math.jl
index 95735015f2..16f9cc8738 100644
--- a/CUDACore/src/device/intrinsics/math.jl
+++ b/CUDACore/src/device/intrinsics/math.jl
@@ -1,5 +1,8 @@
 # math functionality
 
+# we only use libdevice where needed. if possible, we go through LLVM instead,
+# ideally relying on Julia's existing definitions.
+
 @public fma, rsqrt, saturate, byte_perm, assume
 @public add_rn, add_rz, add_rm, add_rp
 @public sub_rn, sub_rz, sub_rm, sub_rp
@@ -286,16 +289,6 @@ end
 
 ## floating-point handling
 
-@device_override Base.isfinite(x::Float32) = (ccall("extern __nv_finitef", llvmcall, Int32, (Cfloat,), x)) != 0
-@device_override Base.isfinite(x::Float64) = (ccall("extern __nv_isfinited", llvmcall, Int32, (Cdouble,), x)) != 0
-
-@device_override Base.isinf(x::Float64) = (ccall("extern __nv_isinfd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isinf(x::Float32) = (ccall("extern __nv_isinff", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.isnan(x::Float64) = (ccall("extern __nv_isnand", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.isnan(x::Float32) = (ccall("extern __nv_isnanf", llvmcall, Int32, (Cfloat,), x)) != 0
-# isnan(::Float16) inherits from Julia (x != x), which compiles to a single setp.neu.f16.
-
 @device_function nearbyint(x::Float64) = ccall("extern __nv_nearbyint", llvmcall, Cdouble, (Cdouble,), x)
 @device_function nearbyint(x::Float32) = ccall("extern __nv_nearbyintf", llvmcall, Cfloat, (Cfloat,), x)
 
@@ -303,29 +296,13 @@ end
 @device_function nextafter(x::Float32, y::Float32) = ccall("extern __nv_nextafterf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
 
-## sign handling
-
-@device_override Base.signbit(x::Float64) = (ccall("extern __nv_signbitd", llvmcall, Int32, (Cdouble,), x)) != 0
-@device_override Base.signbit(x::Float32) = (ccall("extern __nv_signbitf", llvmcall, Int32, (Cfloat,), x)) != 0
-
-@device_override Base.copysign(x::Float64, y::Float64) = ccall("extern __nv_copysign", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
-@device_override Base.copysign(x::Float32, y::Float32) = ccall("extern __nv_copysignf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-
-@device_override Base.abs(x::Int32) =   ccall("extern __nv_abs", llvmcall, Int32, (Int32,), x)
-@device_override Base.abs(f::Float64) = ccall("extern __nv_fabs", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.abs(f::Float32) = ccall("extern __nv_fabsf", llvmcall, Cfloat, (Cfloat,), f)
-# abs(::Float16) inherits from Julia (abs_float intrinsic), lowering to and.b16.
-@device_override Base.abs(x::Int64) =   ccall("extern __nv_llabs", llvmcall, Int64, (Int64,), x)
-
 ## roots and powers
 
-@device_override Base.sqrt(x::Float64) = ccall("extern __nv_sqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.sqrt(x::Float32) = ccall("extern __nv_sqrtf", llvmcall, Cfloat, (Cfloat,), x)
-# sqrt(::Float16) inherits from Julia (Float16(sqrt(Float32(x)))), routing through __nv_sqrtf.
-@device_override FastMath.sqrt_fast(x::Union{Float32, Float64}) = sqrt(x)
-
-@device_function rsqrt(x::Float64) = ccall("extern __nv_rsqrt", llvmcall, Cdouble, (Cdouble,), x)
-@device_function rsqrt(x::Float32) = ccall("extern __nv_rsqrtf", llvmcall, Cfloat, (Cfloat,), x)
+# NVPTX has native `rsqrt.approx.{f32,f64}`; call the intrinsic directly. The
+# obvious alternative, `@fastmath 1/sqrt(x)`, also lowers to `rsqrt.approx`
+# (via `PTXRSqrtFastPass`), but is too aggressive wrt. fast-math behavior.
+@device_function rsqrt(x::Float64) = ccall("llvm.nvvm.rsqrt.approx.d", llvmcall, Cdouble, (Cdouble,), x)
+@device_function rsqrt(x::Float32) = ccall("llvm.nvvm.rsqrt.approx.f", llvmcall, Cfloat, (Cfloat,), x)
 @device_function rsqrt(x::Float16) = Float16(rsqrt(Float32(x)))
 
 @device_override Base.cbrt(x::Float64) = ccall("extern __nv_cbrt", llvmcall, Cdouble, (Cdouble,), x)
@@ -395,15 +372,6 @@ end
 #@device_override Base.rint(x::Float64) = ccall("extern __nv_rint", llvmcall, Cdouble, (Cdouble,), x)
 #@device_override Base.rint(x::Float32) = ccall("extern __nv_rintf", llvmcall, Cfloat, (Cfloat,), x)
 
-@device_override Base.trunc(x::Float64) = ccall("extern __nv_trunc", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.trunc(x::Float32) = ccall("extern __nv_truncf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.ceil(x::Float64) = ccall("extern __nv_ceil", llvmcall, Cdouble, (Cdouble,), x)
-@device_override Base.ceil(x::Float32) = ccall("extern __nv_ceilf", llvmcall, Cfloat, (Cfloat,), x)
-
-@device_override Base.floor(f::Float64) = ccall("extern __nv_floor", llvmcall, Cdouble, (Cdouble,), f)
-@device_override Base.floor(f::Float32) = ccall("extern __nv_floorf", llvmcall, Cfloat, (Cfloat,), f)
-
 #@device_override Base.min(x::Int32, y::Int32) = ccall("extern __nv_min", llvmcall, Int32, (Int32, Int32), x, y)
 #@device_override Base.min(x::Int64, y::Int64) = ccall("extern __nv_llmin", llvmcall, Int64, (Int64, Int64), x, y)
 #@device_override Base.min(x::UInt32, y::UInt32) = convert(UInt32, ccall("extern __nv_umin", llvmcall, Int32, (Int32, Int32), x, y))
@@ -508,27 +476,11 @@ end
 @device_override Base.rem(x::Float32, y::Float32, ::RoundingMode{:Nearest}) = ccall("extern __nv_remainderf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 @device_override Base.rem(x::Float16, y::Float16, ::RoundingMode{:Nearest}) = Float16(rem(Float32(x), Float32(y), RoundNearest))
 
-@device_override FastMath.div_fast(x::Float32, y::Float32) = ccall("extern __nv_fast_fdividef", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
-@device_override FastMath.div_fast(x::Float64, y::Float64) = x * FastMath.inv_fast(y)
-
-@device_override Base.inv(x::Float32) = ccall("extern __nv_frcp_rn", llvmcall, Cfloat, (Cfloat,), x)
-@device_override Base.inv(x::Float64) = ccall("extern __nv_drcp_rn", llvmcall, Cdouble, (Cdouble,), x)
-
-@device_override FastMath.inv_fast(x::Float32) = ccall("llvm.nvvm.rcp.approx.ftz.f", llvmcall, Float32, (Float32,), x)
-@device_override function FastMath.inv_fast(x::Float64)
-    # Get the approximate reciprocal
-    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
-    # This instruction chops off last 32bits of mantissa and computes inverse
-    # while treating all subnormal numbers as 0.0
-    # If reciprocal would be subnormal, underflows to 0.0
-    # 32 least significant bits of the result are filled with 0s
-    inv_x = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), x)
-
-    # Approximate the missing 32bits of mantissa with a single cubic iteration
-    e = fma(inv_x, -x, 1.0)
-    e = fma(e, e, e)
-    inv_x = fma(e, inv_x, inv_x)
-end
+# `Base.FastMath.inv_fast(::AbstractFloat)` is unimplemented upstream (only
+# `Complex` has a method) and the catch-all fallback drops `afn`; use `div_fast`.
+@device_override FastMath.inv_fast(x::Union{Float16, Float32, Float64}) =
+    FastMath.div_fast(one(x), x)
+
 
 ## distributions
 
@@ -549,13 +501,20 @@ end
 @device_override Base.hypot(x::Float64, y::Float64) = ccall("extern __nv_hypot", llvmcall, Cdouble, (Cdouble, Cdouble), x, y)
 @device_override Base.hypot(x::Float32, y::Float32) = ccall("extern __nv_hypotf", llvmcall, Cfloat, (Cfloat, Cfloat), x, y)
 
-@device_override Base.fma(x::Float64, y::Float64, z::Float64) = ccall("llvm.fma.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.fma(x::Float32, y::Float32, z::Float32) = ccall("llvm.fma.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.fma(x::Float16, y::Float16, z::Float16) = ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
-
-@device_override Base.muladd(x::Float64, y::Float64, z::Float64) = ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
-@device_override Base.muladd(x::Float32, y::Float32, z::Float32) = ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
-@device_override Base.muladd(x::Float16, y::Float16, z::Float16) = ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+# `Base.fma(::Float16,...)` branches on `jl_have_fma`; call the intrinsic directly.
+@device_override Base.fma(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fma.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
+
+# Treating `muladd(x, y, z)` as `fma(x, y, z)` is the natural choice on GPU:
+# NVPTX always lowers `llvm.fmuladd.fXX` to `fma.rn`, and routing through
+# `llvm.fmuladd` (rather than Julia's default `fmul contract + fadd contract`)
+# keeps the fusion robust under vectorization (per JuliaGPU/CUDA.jl#3149).
+@device_override Base.muladd(x::Float64, y::Float64, z::Float64) =
+    ccall("llvm.fmuladd.f64", llvmcall, Cdouble, (Cdouble, Cdouble, Cdouble), x, y, z)
+@device_override Base.muladd(x::Float32, y::Float32, z::Float32) =
+    ccall("llvm.fmuladd.f32", llvmcall, Cfloat, (Cfloat, Cfloat, Cfloat), x, y, z)
+@device_override Base.muladd(x::Float16, y::Float16, z::Float16) =
+    ccall("llvm.fmuladd.f16", llvmcall, Float16, (Float16, Float16, Float16), x, y, z)
 
 # Directed rounding for binary arithmetic and fma. NVPTX exposes
 # `{add,mul,div,fma}.{rn,rz,rm,rp}.{f32,f64}` directly; there is no `sub`
diff --git a/test/Project.toml b/test/Project.toml
index b37fcf9ed4..510d5844af 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -11,6 +11,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
diff --git a/test/core/codegen.jl b/test/core/codegen.jl
index db431db4ed..1941f289cc 100644
--- a/test/core/codegen.jl
+++ b/test/core/codegen.jl
@@ -1,61 +1,59 @@
 @testset "LLVM IR" begin
 
 @testset "JuliaLang/julia#21121" begin
-    function foobar()
+    @test @filecheck CUDA.code_llvm(Tuple{}) do
+        @check_not "inttoptr"
         weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}))
-    @test !occursin("inttoptr", ir)
 end
 
 @testset "CUDA.jl#553" begin
-    function kernel(ptr)
-       unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-       return
+    @test @filecheck CUDA.code_llvm(Tuple{Ptr{Float32}}) do ptr
+        @check_not "@__nv_fmaf"
+        unsafe_store!(ptr, CUDA.fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+        return
     end
-
-    ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{Ptr{Float32}}))
-    @test !occursin("@__nv_fmaf", ir)
 end
 
 @testset "fma uses LLVM intrinsic" begin
-    function fma_kernel(ptr)
-        unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
     for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, fma_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fma.$suffix", ir)
-        @test !occursin("__nv_fma", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fma.$suffix"
+            @check_not "__nv_fma"
+            unsafe_store!(ptr, fma(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
     end
 end
 
 @testset "muladd uses LLVM intrinsic" begin
-    function muladd_kernel(ptr)
-        unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
-        return
-    end
-
+    # `Base.muladd` emits `fmul contract + fadd contract` upstream, which the
+    # backend usually fuses to `fma.rn`. On GPU the fusion is unreliable under
+    # vectorization (JuliaGPU/CUDA.jl#3149), so the override routes through
+    # `llvm.fmuladd.fXX` directly.
     for (T, suffix) in ((Float32, "f32"), (Float64, "f64"), (Float16, "f16"))
-        ir = sprint(io->CUDA.code_llvm(io, muladd_kernel, Tuple{Ptr{T}}))
-        @test occursin("llvm.fmuladd.$suffix", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{Ptr{T}}) do ptr
+            @check "llvm.fmuladd.$suffix"
+            unsafe_store!(ptr, muladd(unsafe_load(ptr), unsafe_load(ptr,2), unsafe_load(ptr,3)))
+            return
+        end
     end
 end
 
 @testset "assume" begin
-    foo(i) = cld(42, i)
-    ir = sprint(io->CUDA.code_llvm(io, foo, Tuple{Int}))
-    @test occursin("@gpu_report_exception", ir)
-
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check "@gpu_report_exception"
+        cld(42, i)
+    end
 
-    bar(i) = (CUDA.assume(i > 0); cld(42, i))
-    ir = sprint(io->CUDA.code_llvm(io, bar, Tuple{Int}))
-    @test !occursin("gpu_report_exception", ir)
+    @test @filecheck CUDA.code_llvm(Tuple{Int}) do i
+        @check_not "gpu_report_exception"
+        CUDA.assume(i > 0)
+        cld(42, i)
+    end
 end
 
 @testset "stripping invariant.load" begin
@@ -144,88 +142,30 @@ end
 @testset "PTX" begin
 
 @testset "always_inline" begin
-    function f_expensive(x)
-        Base.Cartesian.@nexprs 30 i -> x = sin(x)+i
-    end
-
-    function g(x)
-        f_expensive(x)
-        return
-    end
-    function h(x)
-        f_expensive(x)
-        return
+    # without `always_inline`, the helper survives as a separate `.func`;
+    # with it set, the helper is inlined and no `.func julia_f_expensive`
+    # declaration remains. `f_expensive` is defined once; the do-block below
+    # calls it and is compiled once per setting of the kwarg, so both
+    # outcomes are checked against the same call edge.
+    f_expensive(x) = (Base.Cartesian.@nexprs 30 i -> x = sin(x)+i; x)
+    for always_inline in (false, true)
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}; always_inline) do x
+            @check     cond=!always_inline "{{\\.func .*julia_f_expensive}}"
+            @check_not cond=always_inline  "{{\\.func .*julia_f_expensive}}"
+            f_expensive(x)
+            return
+        end
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, g, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}; always_inline=true))
-    @test !occursin(r"\.func .*julia_f_expensive", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, h, Tuple{Float64}))
-    @test occursin(r"\.func .*julia_f_expensive", asm)
 end
 
 @testset "local memory stores due to byval" begin
     # JuliaGPU/GPUCompiler.jl#92
-    function kernel(y1, y2)
+    @test @filecheck CUDA.code_ptx(NTuple{2,CuDeviceArray{Float32,1,AS.Global}}) do y1, y2
+        @check_not ".local"
         y = threadIdx().x == 1 ? y1 : y2
         @inbounds y[] = 0
         return
     end
-
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
-    @test !occursin(".local", asm)
-end
-
-@testset "fastmath" begin
-    function div_kernel(x)
-        i = threadIdx().x
-        @fastmath @inbounds x[i] = 1 / x[i]
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, div_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("div.approx.ftz", asm)
-
-    function sqrt_kernel(x)
-        i = threadIdx().x
-        @inbounds x[i] = sqrt(x[i])
-        return
-    end
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}))
-    @test occursin("sqrt.r", asm)
-
-    asm = sprint(io->CUDA.code_ptx(io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1,AS.Global}}; fastmath=true))
-    @test occursin("sqrt.approx.ftz", asm)
-end
-
-@testset "fma/muladd emit fma.rn" begin
-    # fma and muladd should both lower to fma.rn in PTX
-    function fma_kernel(a, b, c)
-        @inbounds a[] = fma(b[], c[], a[])
-        return
-    end
-    function muladd_kernel(a, b, c)
-        @inbounds a[] = muladd(b[], c[], a[])
-        return
-    end
-
-    for T in (Float16, Float32, Float64)
-        asm = sprint(io->CUDA.code_ptx(io, fma_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-        @test !occursin("__nv_fma", asm)
-
-        asm = sprint(io->CUDA.code_ptx(io, muladd_kernel,
-            NTuple{3,CuDeviceArray{T,1,AS.Global}}))
-        @test occursin("fma.rn", asm)
-    end
 end
 
 @testset "header rewrite (.target/.version bump)" begin
diff --git a/test/core/device/array.jl b/test/core/device/array.jl
index 240fed288b..f8f5b7a142 100644
--- a/test/core/device/array.jl
+++ b/test/core/device/array.jl
@@ -68,22 +68,23 @@ end
 
 @testset "bounds checking" begin
     @testset "#313" begin
-        function kernel(dest)
+        kernel = dest -> (dest[1] = 1; nothing)
+        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
+                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
+        @test @filecheck CUDA.code_llvm(tt) do dest
+            @check_not "jl_invoke"
             dest[1] = 1
             nothing
         end
-        tt = Tuple{SubArray{Float64,2,CuDeviceArray{Float64,2,AS.Global},
-                            Tuple{UnitRange{Int64},UnitRange{Int64}},false}}
-
-        ir = sprint(io->CUDA.code_llvm(io, kernel, tt))
-        @test !occursin("jl_invoke", ir)
+        # also smoke-test that PTX codegen succeeds for this signature.
         CUDA.code_ptx(devnull, kernel, tt)
     end
 
     # test that we don't do needless bounds checking when the kernel already does it
     # (enabled by the fact that we store `len` next to `dims`)
-    let
-        function kernel(A)
+    for N in 1:3
+        @test @filecheck CUDA.code_llvm(Tuple{CuDeviceArray{Int,N,AS.Global}}) do A
+            @check_not "boundserror"
             idx = threadIdx().x
             if idx <= length(A)
                 # we did our own bounds checking, so no check should be left!
@@ -91,11 +92,6 @@ end
             end
             return
         end
-
-        for N in 1:3
-            ir = sprint(io->CUDA.code_llvm(io, kernel, Tuple{CuDeviceArray{Int,N,AS.Global}}))
-            @test !occursin("boundserror", ir)
-        end
     end
 end
 
diff --git a/test/core/device/intrinsics.jl b/test/core/device/intrinsics.jl
index 2491e33423..504ef8c3b5 100644
--- a/test/core/device/intrinsics.jl
+++ b/test/core/device/intrinsics.jl
@@ -31,10 +31,10 @@
     @on_device active_mask()
 
     @testset "range metadata" begin
-        foobar() = threadIdx().x
-        ir = sprint(io->CUDA.code_llvm(io, foobar, Tuple{}; raw=true))
-
-        @test occursin(r"call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range", ir)
+        @test @filecheck CUDA.code_llvm(Tuple{}; raw=true) do
+            @check "{{call .+ @llvm.nvvm.read.ptx.sreg.tid.x.+ !range}}"
+            threadIdx().x
+        end
     end
 end
 
diff --git a/test/core/device/intrinsics/math.jl b/test/core/device/intrinsics/math.jl
index 1843f6213f..4ad292390c 100644
--- a/test/core/device/intrinsics/math.jl
+++ b/test/core/device/intrinsics/math.jl
@@ -338,72 +338,34 @@ using SpecialFunctions
 
     @testset "@fastmath sincos" begin
         # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
-        function kernel(a, b, c)
+        @test @filecheck CUDA.code_ptx(NTuple{3,CuDeviceArray{Float32,1,AS.Global}}) do a, b, c
+            @check "sin.approx.f32"
+            @check "cos.approx.f32"
+            @check_not "__nv"  # from libdevice
             @inbounds b[], c[] = @fastmath sincos(a[])
             return
         end
-        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
-        @assert contains(asm, "sin.approx.f32")
-        @assert contains(asm, "cos.approx.f32")
-        @assert !contains(asm, "__nv")  # from libdevice
     end
 
     @testset "inv" begin
-        # Base.inv should use accurate rcp instructions (rcp.rn)
+        # Base.inv should use accurate rcp instructions (rcp.rn).
+        # PTX-level patterns for inv / inv_fast / div / div_fast are pinned in
+        # the "div/inv PTX" testset; here we only sanity-check correctness on GPU.
         for T in (Float32, Float64)
             @test testf(x -> inv.(x), rand(T, 10) .+ T(0.1))
             @test testf(x -> inv.(x), T[0.1, 0.5, 1.0, 2.0, 10.0, 100.0])
         end
-
-        function kernel_inv_f32(a)
-            @inbounds a[] = inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
-        @test contains(asm, "rcp.rn.f32")
-
-        function kernel_inv_f64(a)
-            @inbounds a[] = inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.rn.f64")
     end
 
     @testset "inv_fast" begin
-        # inv_fast(Float32) uses rcp.approx.ftz.f32 (~14 bits of mantissa)
-        function kernel_inv_fast_f32(a)
-            @inbounds a[] = @fastmath inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f32, NTuple{1, CuDeviceArray{Float32, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f32")
-
         fast_inv(x) = @fastmath inv(x)
         xs32 = Float32[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
         @test Array(map(fast_inv, cu(xs32))) ≈ inv.(xs32) rtol = 1.0f-4
-
-        # inv_fast(Float64) uses rcp.approx.ftz.f64 refined with Newton-Raphson
-        function kernel_inv_fast_f64(a)
-            @inbounds a[] = @fastmath inv(a[])
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_inv_fast_f64, NTuple{1, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f64")
-
         xs64 = Float64[0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
         @test Array(map(fast_inv, CuArray(xs64))) ≈ inv.(xs64) rtol = 1.0e-10
     end
 
     @testset "div_fast Float64" begin
-        # FastMath.div_fast(Float64) uses fast reciprocal: x * inv_fast(y)
-        function kernel_div_fast_f64(a, b, c)
-            @inbounds c[] = @fastmath a[] / b[]
-            return
-        end
-        asm = sprint(io -> CUDA.code_ptx(io, kernel_div_fast_f64, NTuple{3, CuDeviceArray{Float64, 1, AS.Global}}))
-        @test contains(asm, "rcp.approx.ftz.f64")
-
         fast_div(x, y) = @fastmath x / y
         xs = rand(Float64, 10) .+ 0.1
         ys = rand(Float64, 10) .+ 0.1
@@ -420,4 +382,246 @@ using SpecialFunctions
             @test isequal(maximum(AT([NaN])), NaN)
         end
     end
+
+    # PTX lowering pins for the standard math ops. Most of these used to
+    # require `@device_override`s pointing at libdevice; now they're handled
+    # by Julia + the NVPTX backend + GPUCompiler's `apply_fastmath!`,
+    # `PTXFDivFastPass`, and `PTXFSqrtFastPass`. Each testset pins the actual
+    # PTX so the wiring stays put across {f32, f64} × {plain, `@fastmath`} ×
+    # {default, job-wide `fastmath=true`}.
+
+    @testset "abs PTX" begin
+        for fastmath in (false, true)
+            # f32: job-wide fastmath flips to the `.ftz` variant.
+            @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+                @check cond=fastmath  "abs.ftz.f32"
+                @check cond=!fastmath "abs.f32"
+                @check_not "__nv_"
+                abs(x)
+            end
+            # f64: no FTZ on PTX for f64.
+            @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+                @check "abs.f64"
+                @check_not "__nv_"
+                abs(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Int32}) do x
+            @check "abs.s32"
+            @check_not "__nv_"
+            abs(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Int64}) do x
+            @check "abs.s64"
+            @check_not "__nv_"
+            abs(x)
+        end
+    end
+
+    @testset "floor/ceil/trunc PTX" begin
+        for (op, rnd) in ((floor, "rmi"), (ceil, "rpi"), (trunc, "rzi"))
+            for fastmath in (false, true)
+                @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath) do x
+                    @check cond=fastmath  "cvt.$rnd.ftz.f32.f32"
+                    @check cond=!fastmath "cvt.$rnd.f32.f32"
+                    @check_not "__nv_"
+                    op(x)
+                end
+                @test @filecheck CUDA.code_ptx(Tuple{Float64}; fastmath) do x
+                    @check "cvt.$rnd.f64.f64"
+                    @check_not "__nv_"
+                    op(x)
+                end
+            end
+        end
+    end
+
+    @testset "isnan/isinf/isfinite PTX" begin
+        # All three should be pure FP compares / bit-tests, no libdevice.
+        for T in (Float32, Float64), op in (isnan, isinf, isfinite)
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check_not "__nv_"
+                op(x)
+            end
+        end
+        # `isnan(x) = x != x` is the cleanest: a single `setp.nan.fXX`.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "setp.nan.f32"
+            isnan(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "setp.nan.f64"
+            isnan(x)
+        end
+    end
+
+    @testset "signbit PTX" begin
+        for T in (Float32, Float64)
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check_not "__nv_"
+                signbit(x)
+            end
+        end
+    end
+
+    @testset "copysign PTX" begin
+        # NVPTX has no single copysign instruction (custom-lowered to bit ops);
+        # we just verify libdevice isn't on the path.
+        for T in (Float32, Float64)
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check_not "__nv_"
+                copysign(x, y)
+            end
+        end
+    end
+
+    @testset "min/max PTX" begin
+        # Plain `min`/`max` propagate NaN (Julia semantics). f32 with sm_80+
+        # + LLVM 14+ gets `min.NaN.f32`/`max.NaN.f32` directly; f64 has to
+        # emulate since PTX has no `.NaN` variant for f64. Pin `arch=sm"80"`
+        # so the test is deterministic regardless of the CI runner's device.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y
+            @check "min.NaN.f32"
+            min(x, y)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; arch=sm"80") do x, y
+            @check "max.NaN.f32"
+            max(x, y)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+            @check_not "__nv_"
+            min(x, y)
+        end
+
+        # `@fastmath min`/`max` = `ifelse(y > x, x, y)`/`ifelse(y > x, y, x)`: compare + select.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "setp.lt.$s"
+                @check "selp.$s"
+                @fastmath min(x, y)
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "setp.lt.$s"
+                @check "selp.$s"
+                @fastmath max(x, y)
+            end
+        end
+    end
+
+    @testset "fma/muladd PTX" begin
+        # `Base.fma` lowers to `llvm.fma.fXX` (have_fma branch folded for
+        # f32/f64 by GPUCompiler; for f16 we keep an explicit override).
+        # `muladd` is overridden to call `llvm.fmuladd.fXX`, which NVPTX
+        # lowers to a fused op. Either way: a single `fma.rn` per type.
+        for (T, s) in ((Float16, "f16"), (Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+                @check "fma.rn.$s"
+                @check_not "__nv_fma"
+                fma(x, y, z)
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T, T, T}) do x, y, z
+                @check "fma.rn.$s"
+                muladd(x, y, z)
+            end
+        end
+    end
+
+    @testset "sqrt PTX" begin
+        # Inherits from Julia (`llvm.sqrt.fXX`). Plain → `sqrt.rn.fXX`;
+        # per-call `@fastmath` → `sqrt.approx.fXX` (via `PTXFSqrtFastPass`);
+        # job-wide `fastmath=true` → the FTZ variant via `apply_fastmath!`.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "sqrt.rn.$s"
+                @check_not "sqrt.approx"
+                sqrt(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "sqrt.approx.f32"
+            @check_not "sqrt.approx.ftz"
+            @fastmath sqrt(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+            @check "sqrt.approx.ftz.f32"
+            sqrt(x)
+        end
+        # NVPTX has no native fast f64 sqrt; backend builds it from rsqrt + rcp.
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "rsqrt.approx.f64"
+            @fastmath sqrt(x)
+        end
+    end
+
+    @testset "rsqrt PTX" begin
+        # `CUDA.rsqrt(x)` directly calls the NVPTX `rsqrt.approx.{f,d}`
+        # intrinsic — no libdevice, and no `@fastmath` so caller-side NaN/Inf
+        # checks aren't DCE'd by `nnan`/`ninf` propagation. f16 computes in
+        # f32, so it still hits the f32 instruction.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "rsqrt.approx.$s"
+                @check_not "sqrt.approx"
+                @check_not "__nv_"
+                CUDA.rsqrt(x)
+            end
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float16}) do x
+            @check "rsqrt.approx.f32"
+            @check_not "__nv_"
+            CUDA.rsqrt(x)
+        end
+    end
+
+    @testset "div/inv PTX" begin
+        # `Base.{/, inv}` and their fast variants are handled by GPUCompiler's
+        # `PTXFDivFastPass`. `inv(x) = 1/x`; NVPTX pattern-matches
+        # `fdiv 1.0, x` to `rcp.rn`.
+        for (T, s) in ((Float32, "f32"), (Float64, "f64"))
+            @test @filecheck CUDA.code_ptx(Tuple{T, T}) do x, y
+                @check "div.rn.$s"
+                x / y
+            end
+            @test @filecheck CUDA.code_ptx(Tuple{T}) do x
+                @check "rcp.rn.$s"
+                inv(x)
+            end
+        end
+
+        # `@fastmath` on f32: pass picks the non-FTZ `div.approx.f32` since
+        # the job isn't fast; f64 always uses rcp+Newton.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}) do x, y
+            @check "div.approx.f32"
+            @check_not "div.approx.ftz"
+            @fastmath x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}) do x
+            @check "div.approx.f32"
+            @check_not "div.approx.ftz"
+            @fastmath inv(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}) do x, y
+            @check "rcp.approx.ftz.f64"
+            @fastmath x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64}) do x
+            @check "rcp.approx.ftz.f64"
+            @fastmath inv(x)
+        end
+
+        # Job-wide `fastmath=true` stamps `afn` on every fdiv → same as
+        # `@fastmath`, and f32 additionally picks up FTZ.
+        @test @filecheck CUDA.code_ptx(Tuple{Float32, Float32}; fastmath=true) do x, y
+            @check "div.approx.ftz.f32"
+            x / y
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float32}; fastmath=true) do x
+            @check "div.approx.ftz.f32"
+            inv(x)
+        end
+        @test @filecheck CUDA.code_ptx(Tuple{Float64, Float64}; fastmath=true) do x, y
+            @check "rcp.approx.ftz.f64"
+            x / y
+        end
+    end
 end
diff --git a/test/core/device/intrinsics/wmma.jl b/test/core/device/intrinsics/wmma.jl
index f4ef5fdf4c..3281c56157 100644
--- a/test/core/device/intrinsics/wmma.jl
+++ b/test/core/device/intrinsics/wmma.jl
@@ -478,36 +478,31 @@ end
 
 @testset "Codegen addressing" begin
     @testset "Global" begin
-        function kernel(d)
+        @test @filecheck CUDA.code_ptx((CuDeviceArray{Float32,1,CUDA.AS.Global},)) do d
+            # A CHECK-NOT only scans between neighbouring matches, so the generic store is
+            # rejected on both sides of the expected `.global` store (i.e. in the whole PTX).
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
+            @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32}}"
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
             conf = WMMA.Config{16, 16, 16, Float32}
-
             d_frag = WMMA.fill_c(Float32(0), conf)
             WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
-
             return
         end
-
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
-
-        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-        @test  occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
     end
 
     @testset "Shared" begin
-        function kernel()
+        @test @filecheck CUDA.code_ptx(()) do
+            # Two-sided guard: no generic store before or after the `.shared` store.
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
+            @check "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32}}"
+            @check_not "{{wmma.store.d.sync(.aligned)?.col.m16n16k16.f32}}"
             shmem = CuStaticSharedArray(Float32, (16, 16))
             conf = WMMA.Config{16, 16, 16, Float32}
-
             d_frag = WMMA.fill_c(Float32(0), conf)
             WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)
-
             return
         end
-
-        ptx = sprint(io -> CUDA.code_ptx(io, kernel, ()))
-
-        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-        @test  occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
     end
 end
 
diff --git a/test/core/device/ldg.jl b/test/core/device/ldg.jl
index 0385e4e34a..b25f953449 100644
--- a/test/core/device/ldg.jl
+++ b/test/core/device/ldg.jl
@@ -1,15 +1,14 @@
 @testset "ldg" begin
-    # NOTE: This is necessary because it seems that code_llvm has a bug which causes it to ignore
-    #       the method table. Wrapping it in a function gets us what we want currently but the PR
-    #       here: https://github.com/JuliaLang/julia/pull/60718 will likely fix this according to
-    #       @vchuravy. It is currently not backported.
-    ir = sprint(io->CUDA.code_llvm(io, (args...)->CUDACore.pointerref_ldg(args...), Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true))
-    if Base.libllvm_version >= v"20"
-        # LLVM 20 removed `@llvm.nvvm.ldg.*`; we now emit a plain load with
-        # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`.
-        @test occursin("!invariant.load", ir)
-    else
-        @test occursin("@llvm.nvvm.ldg", ir)
+    # NOTE: wrapping the call in a function (here, the do-block) is necessary because
+    # code_llvm has a bug that causes it to ignore the method table;
+    # JuliaLang/julia#60718 will likely fix this according to @vchuravy.
+    # LLVM 20 removed `@llvm.nvvm.ldg.*`; from that version on we emit a plain load with
+    # `!invariant.load` metadata, which NVPTX lowers to `ld.global.nc`.
+    new_llvm = Base.libllvm_version >= v"20"
+    @test @filecheck CUDA.code_llvm(Tuple{Core.LLVMPtr{Int,AS.Global},Int,Val{1}}; raw=true) do args...
+        @check cond=new_llvm  "!invariant.load"  # expected on LLVM >= 20
+        @check cond=!new_llvm "@llvm.nvvm.ldg"  # expected on LLVM < 20
+        CUDACore.pointerref_ldg(args...)
     end
 end
 
diff --git a/test/core/execution.jl b/test/core/execution.jl
index 387af3547e..4f8a8d3495 100644
--- a/test/core/execution.jl
+++ b/test/core/execution.jl
@@ -61,16 +61,22 @@ end
     # to constructing the SMVersion directly.
     @cuda launch=false arch=dev_cap dummy()
     # but we should be able to see it in the generated PTX code
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm"50"))
-    @test contains(asm, ".target sm_50")
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=v"5.0"))
-    @test contains(asm, ".target sm_50")
+    @test @filecheck CUDA.code_ptx((); arch=sm"50") do  # arch given as an `sm"…"` literal
+        @check ".target sm_50"
+        dummy()
+    end
+    @test @filecheck CUDA.code_ptx((); arch=v"5.0") do  # arch given as a plain version number
+        @check ".target sm_50"
+        dummy()
+    end
 
     # explicit `ptx=` is taken as an exact request (codegen-test affordance), so the
     # `.version` line should match what was asked for, independently of what LLVM and
     # ptxas would natively pick.
-    asm = sprint(io->CUDA.code_ptx(io, dummy, (); ptx=v"6.3"))
-    @test contains(asm, ".version 6.3")
+    @test @filecheck CUDA.code_ptx((); ptx=v"6.3") do
+        @check ".version 6.3"  # must echo the requested `ptx=` value
+        dummy()
+    end
 
     # explicit `ptx=` is validated against BOTH LLVM and ptxas (not just LLVM as it
     # used to be); a clearly out-of-range value must error at config time.
@@ -84,14 +90,18 @@ end
     sm_f = SMVersion(dev_cap.major, dev_cap.minor, :family)
 
     if dev_cap >= v"9.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_a))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        @test @filecheck CUDA.code_ptx((); arch=sm_a) do
+            @check ".target $(CUDACore.cpu_name(sm_a))"  # target name as reported by `cpu_name` for `sm_a`
+            dummy()
+        end
         # arch-specific cubin should also actually launch on the matching device
         @cuda arch=sm_a dummy()
     end
     if dev_cap >= v"10.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, (); arch=sm_f))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_f))")
+        @test @filecheck CUDA.code_ptx((); arch=sm_f) do
+            @check ".target $(CUDACore.cpu_name(sm_f))"  # `sm_f` is the `:family` SMVersion built above
+            dummy()
+        end
         @cuda arch=sm_f dummy()
     end
 
@@ -103,8 +113,10 @@ end
     # since we know the exact device. The cuda-side `.target` is the variant regardless of
     # LLVM support (the mcgen rewrite stamps it in); only the LLVM-emitted code differs.
     if dev_cap >= v"9.0"
-        asm = sprint(io->CUDA.code_ptx(io, dummy, ()))
-        @test contains(asm, ".target $(CUDACore.cpu_name(sm_a))")
+        @test @filecheck CUDA.code_ptx(()) do  # no explicit `arch=`: exercises the default target
+            @check ".target $(CUDACore.cpu_name(sm_a))"
+            dummy()
+        end
     end
 
     # `target_feature_set()` reads back the feature set the *LLVM-emitted* code was built
diff --git a/test/helpers.jl b/test/helpers.jl
index 2809a4b4a8..7ad4eb1a5e 100644
--- a/test/helpers.jl
+++ b/test/helpers.jl
@@ -6,6 +6,7 @@
 using CUDA, CUDACore, GPUArrays
 using CUDA: i32
 using Adapt
+using FileCheck
 using ..Main: TestSuite, can_use_cupti, sanitize
 
 testf(@nospecialize(f), @nospecialize(xs...); kwargs...) =
diff --git a/test/setup.jl b/test/setup.jl
index dc28bd9dc9..2561cbe7a1 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -1,6 +1,7 @@
 using Test
 using CUDA
 using CUDACore
+using FileCheck
 using GPUArrays
 using NVML: has_nvml, NVML
 using ParallelTestRunner