From 0ec8b9cf7513c4c5d5167b9fafef20d83f3cb866 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 9 Feb 2026 16:18:44 +0100
Subject: [PATCH 01/17] Emit our intrinsics as :calls instead of :invoke.

That avoids CodeInstances (CIs) for intrinsics, which makes it easier to spoof their effects through efuncs.
---
 src/compiler/codegen/expressions.jl |  4 +--
 src/compiler/interface.jl           | 44 ++++++++++++++++++++---------
 src/compiler/intrinsics.jl          | 11 +-------
 src/compiler/intrinsics/atomics.jl  |  9 ++++--
 src/compiler/intrinsics/memory.jl   |  3 +-
 src/compiler/intrinsics/misc.jl     |  3 +-
 src/compiler/intrinsics/views.jl    |  3 +-
 7 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl
index 02b7c38..9abe616 100644
--- a/src/compiler/codegen/expressions.jl
+++ b/src/compiler/codegen/expressions.jl
@@ -79,9 +79,7 @@ function emit_call!(ctx::CGCtx, expr::Expr, @nospecialize(result_type))
     func = get_constant(ctx, args[1])
     call_args = args[2:end]
 
-    # TODO: This is normally dynamic dispatch, which we should allow.
-    #       However, we currently trigger this when emitting Julia intrinsics.
-    #       We should switch to our own intrinsics entirely, which are only invoked.
+    # We enter here for dynamic dispatch, but also for all intrinsic functions.
 
     @static if isdefined(Core, :throw_methoderror)
         if func === Core.throw_methoderror
diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index bdc6e8a..ebc5607 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -74,15 +74,21 @@ CC.may_compress(::cuTileInterpreter) = true
 CC.may_discard_trees(::cuTileInterpreter) = false
 
 #=============================================================================
- Custom return-type inference (tfuncs) for intrinsics
+ Custom inference for intrinsics
 =============================================================================#
 
-# Per-intrinsic return type overrides using multiple dispatch.
+# Per-intrinsic return type overrides.
 # Returns nothing when no override applies (fallback).
-# Concrete per-intrinsic methods are defined in intrinsics/ (after the
-# Intrinsics module exists).
 tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing
 
+# Per-intrinsic effect overrides.
+# Returns nothing when no override applies (fallback).
+efunc(@nospecialize(f), effects::CC.Effects) = nothing
+
+# Predicate for functions defined in the Intrinsics module.
+# These get NoCallInfo() so they stay as Expr(:call) rather than Expr(:invoke).
+isintrinsic(@nospecialize(f)) = isa(f, Function) && parentmodule(f) === Intrinsics
+
 #=============================================================================
  Subprogram inference for reduce/scan
 =============================================================================#
@@ -172,9 +178,10 @@ end
         result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any,
             arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing},
             sv::CC.InferenceState, max_methods::Int)
+        is_intr = isintrinsic(f)
         rt_override = tfunc(f, arginfo.argtypes)
         subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv)
-        rt_override === nothing && subprog === nothing && return result
+        !is_intr && rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
         push!(sv.tasks, function (interp′, sv′)
             isready(result) || return false
@@ -182,8 +189,11 @@ end
             cm = result[]
             sp = subprog !== nothing ? subprog[] : nothing
             rt = rt_override !== nothing ? rt_override : cm.rt
-            info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info
-            wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements)
+            efunc_override = is_intr ? efunc(f, cm.effects) : nothing
+            effects = efunc_override !== nothing ? efunc_override : cm.effects
+            info = is_intr ? CC.NoCallInfo() : cm.info
+            info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
+            wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
             return true
         end)
         return wrapped
@@ -195,9 +205,10 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
         result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any,
             arginfo::CC.ArgInfo, si::CC.StmtInfo,
             sv::CC.InferenceState, max_methods::Int)
+        is_intr = isintrinsic(f)
         rt_override = tfunc(f, arginfo.argtypes)
         subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv)
-        rt_override === nothing && subprog === nothing && return result
+        !is_intr && rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
         push!(sv.tasks, function (interp′, sv′)
             isready(result) || return false
@@ -205,8 +216,11 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
             cm = result[]
             sp = subprog !== nothing ? subprog[] : nothing
             rt = rt_override !== nothing ? rt_override : cm.rt
-            info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info
-            wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements)
+            efunc_override = is_intr ? efunc(f, cm.effects) : nothing
+            effects = efunc_override !== nothing ? efunc_override : cm.effects
+            info = is_intr ? CC.NoCallInfo() : cm.info
+            info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
+            wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
             return true
         end)
         return wrapped
@@ -219,10 +233,14 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
             arginfo::CC.ArgInfo, si::CC.StmtInfo,
             sv::CC.AbsIntState, max_methods::Int)
         _infer_subprogram(interp, f, arginfo, si, nothing, sv)  # side-effect only
+        is_intr = isintrinsic(f)
         rt_override = tfunc(f, arginfo.argtypes)
-        if rt_override !== nothing
-            return CC.CallMeta(rt_override, result.exct, result.effects,
-                               result.info)
+        rt = rt_override !== nothing ? rt_override : result.rt
+        efunc_override = is_intr ? efunc(f, result.effects) : nothing
+        effects = efunc_override !== nothing ? efunc_override : result.effects
+        info = is_intr ? CC.NoCallInfo() : result.info
+        if is_intr || rt_override !== nothing
+            return CC.CallMeta(rt, result.exct, effects, info)
         end
         return result
     end
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index aa0d425..1aa42d5 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -4,7 +4,7 @@
 
 module Intrinsics
 
-using Base: compilerbarrier, donotdelete
+using Base: compilerbarrier
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
@@ -19,15 +19,6 @@ end
 #       Sometimes that's not possible, e.g., because the functionality required for that is
 #       overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those
 #       intrinsics we disable constant folding using a `compilerbarrier(:const)`
-#
-# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their
-#       bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin
-#       with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body.
-#       `@assume_effects !:effect_free` does NOT work — `override_effects` can only strengthen
-#       effects (set ALWAYS_TRUE), not weaken them. Spoofing `ipo_effects` via a custom
-#       `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting
-#       `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct
-#       approach.
 
 emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
 
diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl
index 3c89bd4..faabebf 100644
--- a/src/compiler/intrinsics/atomics.jl
+++ b/src/compiler/intrinsics/atomics.jl
@@ -41,10 +41,11 @@ end
     """
     @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired,
                                    memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete()
         compilerbarrier(:const, zero(T))::T
     end
 end
+efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -179,10 +180,11 @@ end
     """
     @noinline function atomic_xchg(array::TileArray{T, N}, index, val,
                                     memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete()
         compilerbarrier(:const, zero(T))
     end
 end
+efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args)
     emit_atomic_rmw!(ctx, args, AtomicXCHG)
 end
@@ -198,10 +200,11 @@ end
     """
     @noinline function atomic_add(array::TileArray{T, N}, index, val,
                                    memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete()
         compilerbarrier(:const, zero(T))
     end
 end
+efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args)
     emit_atomic_rmw!(ctx, args, AtomicADD)
 end
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 1d42ad5..4db4b46 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -95,10 +95,11 @@ end
     @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
                                       latency::Union{Int, Nothing},
                                       mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        donotdelete()
         nothing
     end
 end
+efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl
index 0b9f332..19a8534 100644
--- a/src/compiler/intrinsics/misc.jl
+++ b/src/compiler/intrinsics/misc.jl
@@ -3,10 +3,11 @@
 # cuda_tile.assert
 @eval Intrinsics begin
     @noinline function assert(cond::Bool, message::String)
-        donotdelete(cond, message)
         nothing
     end
 end
+efunc(::typeof(Intrinsics.assert), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args)
     cond = @something emit_value!(ctx, args[1]) throw(IRError("assert: cannot resolve condition"))
     message = @something get_constant(ctx, args[2]) throw(IRError("assert: requires constant message"))
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index c8f1a88..1c6e7c6 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -378,10 +378,11 @@ end
                                              latency::Union{Int, Nothing},
                                              allow_tma::Bool,
                                              indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        donotdelete()
         nothing
     end
 end
+efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args)
     cb = ctx.cb
     tt = ctx.tt

From 23c37db9462f2581cc9f0896bca16f49cc87c549 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 9 Feb 2026 17:08:32 +0100
Subject: [PATCH 02/17] Add const-prop tests.

---
 test/codegen/integration.jl | 46 +++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index 2e0cb80..a1503dc 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -688,6 +688,52 @@ end
                 end
             end
         end
+
+        @testset "float constant addition folds through addf" begin
+            @test @filecheck begin
+                @check_label "entry"
+                @check_not "addf"
+                @check "constant <f32: 5"
+                @check "mulf"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    scale = 2.0f0 + 3.0f0
+                    Base.donotdelete(tile .* scale)
+                    return
+                end
+            end
+        end
+
+        @testset "integer constant subtraction folds through subi" begin
+            @test @filecheck begin
+                @check_label "entry"
+                @check_not "subi"
+                @check "load_view"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a
+                    idx = Int32(5) - Int32(2)
+                    tile = ct.load(a, idx, (16,))
+                    Base.donotdelete(tile)
+                    return
+                end
+            end
+        end
+
+        @testset "float constant multiplication folds through mulf" begin
+            @test @filecheck begin
+                @check_label "entry"
+                @check "constant <f32: 6"
+                @check "broadcast"
+                @check "mulf"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    scale = 2.0f0 * 3.0f0
+                    Base.donotdelete(tile .* scale)
+                    return
+                end
+            end
+        end
     end
 end
 

From 42376a7b101c6c44324e4cba90306e794fa835e6 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 9 Feb 2026 21:04:37 +0100
Subject: [PATCH 03/17] Simplify intrinsic definitions.

---
 Project.toml                           |   2 +
 src/compiler/interface.jl              |  19 ++
 src/compiler/intrinsics.jl             |  42 +++-
 src/compiler/intrinsics/arithmetic.jl  | 271 ++++++++++++++-----------
 src/compiler/intrinsics/atomics.jl     |  47 +----
 src/compiler/intrinsics/conversions.jl |  28 +--
 src/compiler/intrinsics/core.jl        | 209 ++++---------------
 src/compiler/intrinsics/math.jl        | 131 ++++--------
 src/compiler/intrinsics/memory.jl      |  47 ++---
 src/compiler/intrinsics/misc.jl        |   6 +-
 src/compiler/intrinsics/views.jl       |  74 ++-----
 11 files changed, 322 insertions(+), 554 deletions(-)

diff --git a/Project.toml b/Project.toml
index dd1c4ea..cdff353 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,6 +11,7 @@ BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8"
 CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d"
+ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
 IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93"
 
 [weakdeps]
@@ -31,4 +32,5 @@ BFloat16s = "0.6"
 CompilerCaching = "0.1"
 CUDA_Compiler_jll = "0.4"
 CUDA_Tile_jll = "13.1"
+ExprTools = "0.1"
 IRStructurizer = "0.1"
diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index ebc5607..9080e87 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -191,6 +191,11 @@ end
             rt = rt_override !== nothing ? rt_override : cm.rt
             efunc_override = is_intr ? efunc(f, cm.effects) : nothing
             effects = efunc_override !== nothing ? efunc_override : cm.effects
+            # Mark intrinsics as non-consistently-overlayed so callers can't be
+            # concrete-eval'd (which would execute the intrinsics' dummy bodies).
+            if is_intr
+                effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
+            end
             info = is_intr ? CC.NoCallInfo() : cm.info
             info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
             wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
@@ -218,6 +223,11 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
             rt = rt_override !== nothing ? rt_override : cm.rt
             efunc_override = is_intr ? efunc(f, cm.effects) : nothing
             effects = efunc_override !== nothing ? efunc_override : cm.effects
+            # Mark intrinsics as non-consistently-overlayed so callers can't be
+            # concrete-eval'd (which would execute the intrinsics' dummy bodies).
+            if is_intr
+                effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
+            end
             info = is_intr ? CC.NoCallInfo() : cm.info
             info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
             wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
@@ -238,6 +248,11 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
         rt = rt_override !== nothing ? rt_override : result.rt
         efunc_override = is_intr ? efunc(f, result.effects) : nothing
         effects = efunc_override !== nothing ? efunc_override : result.effects
+        # Mark intrinsics as non-consistently-overlayed so callers can't be
+        # concrete-eval'd (which would execute the intrinsics' dummy bodies).
+        if is_intr
+            effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
+        end
         info = is_intr ? CC.NoCallInfo() : result.info
         if is_intr || rt_override !== nothing
             return CC.CallMeta(rt, result.exct, effects, info)
@@ -247,6 +262,7 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
 end
 
 # Disable semi-concrete interpretation (broken with overlays per JuliaLang/julia#47349)
+# and block concrete eval for intrinsics (not_callable() bodies return dummy values).
 function CC.concrete_eval_eligible(interp::cuTileInterpreter,
     @nospecialize(f), result::CC.MethodCallResult, arginfo::CC.ArgInfo, sv::CC.InferenceState)
     ret = @invoke CC.concrete_eval_eligible(interp::CC.AbstractInterpreter,
@@ -254,6 +270,9 @@ function CC.concrete_eval_eligible(interp::cuTileInterpreter,
     if ret === :semi_concrete_eval
         return :none
     end
+    if ret === :concrete_eval && isintrinsic(f)
+        return :none
+    end
     return ret
 end
 
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 1aa42d5..4cb6d0a 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -12,13 +12,41 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal
 
 end
 
-# NOTE: Due to JuliaLang/julia#60583, intrinsics may be called during constant evaluation.
-#       Because of that, such intrinsics (such as basic arithmetic) need to provide an
-#       implementation that actually computes a valid result using Julia intrinsics.
-#
-#       Sometimes that's not possible, e.g., because the functionality required for that is
-#       overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those
-#       intrinsics we disable constant folding using a `compilerbarrier(:const)`
+# NOTE: Intrinsics are never directly folded (concrete_eval_eligible returns :none,
+#       nonoverlayed=ALWAYS_FALSE taints caller effects). However, overlay callers
+#       with @assume_effects :foldable override the propagated effects, causing the
+#       compiler to concrete-evaluate through intrinsic bodies (JuliaLang/julia#60583).
+#       Intrinsics on such paths need callable bodies (function definition form).
+#       All others use compilerbarrier(:type, nothing) as a dummy body (bare signature).
+
+using ExprTools: splitdef, combinedef
+
+"""
+    @intrinsic signature
+    @intrinsic function_definition
+
+Define a Tile IR intrinsic in the `Intrinsics` module.
+
+A bare signature (e.g. `@intrinsic foo(x)`) creates a dummy body using
+`compilerbarrier(:type, nothing)` so body inference returns `Any`. Actual
+return types come from `tfunc` overrides in the interpreter.
+
+A function definition (e.g. `@intrinsic foo(x) = expr`) preserves the body,
+providing a callable implementation for concrete evaluation. This is needed
+when overlay callers with `@assume_effects :foldable` cause the compiler to
+evaluate through intrinsic bodies (JuliaLang/julia#60583). The body should
+provide a correct scalar implementation using `Core.Intrinsics`, or return
+`nothing` for side-effect-only intrinsics.
+"""
+macro intrinsic(ex)
+    if ex isa Expr && ex.head in (:function, :(=))
+        funcdef = combinedef(splitdef(ex))
+    else
+        funcdef = Expr(:function, ex, quote compilerbarrier(:type, nothing) end)
+    end
+    funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef)
+    return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef)))))
+end
 
 emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
 
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 6272251..861731b 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -84,53 +84,60 @@ end
 ## Integer arithmetic
 
 # cuda_tile.absi
-@eval Intrinsics begin
-    """Integer absolute value. Compiled to cuda_tile.absi."""
-    @noinline absi(x::T) where {T<:Integer} =
-        ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x)
-    @noinline absi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a)
+@intrinsic absi(x::T) where {T<:Integer} =
+    ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x)
+@intrinsic absi(a::Tile)
+function tfunc(::typeof(Intrinsics.absi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args)
     emit_unop!(ctx, args, encode_AbsIOp!)
 end
 
 # cuda_tile.addi
-@eval Intrinsics begin
-    @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y)
-    @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
+@intrinsic addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y)
+@intrinsic addi(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.addi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args)
     emit_binop!(ctx, args, encode_AddIOp!)
 end
 
 # cuda_tile.cldi (ceiling division, toward positive infinity)
-@eval Intrinsics begin
-    @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T))
-end
+@intrinsic cldi(x, y, s)
+tfunc(::typeof(Intrinsics.cldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf)
 end
 
 # cuda_tile.cmpi
-@eval Intrinsics begin
-    @noinline function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
-        if pred === CmpLessThan
-            s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-        elseif pred === CmpLessThanOrEqual
-            s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y)
-        elseif pred === CmpGreaterThan
-            s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x)
-        elseif pred === CmpGreaterThanOrEqual
-            s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x)
-        elseif pred === CmpEqual
-            Core.Intrinsics.eq_int(x, y)
-        else  # CmpNotEqual
-            Core.Intrinsics.ne_int(x, y)
-        end
+@intrinsic function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
+    if pred === CmpLessThan
+        s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
+    elseif pred === CmpLessThanOrEqual
+        s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y)
+    elseif pred === CmpGreaterThan
+        s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x)
+    elseif pred === CmpGreaterThanOrEqual
+        s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x)
+    elseif pred === CmpEqual
+        Core.Intrinsics.eq_int(x, y)
+    else  # CmpNotEqual
+        Core.Intrinsics.ne_int(x, y)
     end
-    @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} =
-        Tile{Bool, S}()
+end
+@intrinsic cmpi(a::Tile, b::Tile, pred, s)
+function tfunc(::typeof(Intrinsics.cmpi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    if t <: Tile
+        S = t.parameters[2]
+        return Tile{Bool, S}
+    end
+    return nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args)
     cb = ctx.cb
@@ -156,10 +163,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args)
 end
 
 # cuda_tile.divi (truncating division, toward zero)
-@eval Intrinsics begin
-    @noinline function divi(x::T, y::T, s::Signedness) where {T<:Integer}
-        s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y)
-    end
+@intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer}
+    s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness"))
@@ -167,22 +172,22 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
 end
 
 # cuda_tile.fldi (floor division, toward negative infinity)
-@eval Intrinsics begin
-    @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T))
-end
+@intrinsic fldi(x, y, s)
+tfunc(::typeof(Intrinsics.fldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf)
 end
 
 # cuda_tile.maxi
-@eval Intrinsics begin
-    @noinline function maxi(x::T, y::T, s::Signedness) where {T<:Integer}
-        lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-        ifelse(lt, y, x)
-    end
-    @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} =
-        Tile{T, S}()
+@intrinsic function maxi(x::T, y::T, s::Signedness) where {T<:Integer}
+    lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
+    ifelse(lt, y, x)
+end
+@intrinsic maxi(a::Tile, b::Tile, s)
+function tfunc(::typeof(Intrinsics.maxi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness"))
@@ -190,13 +195,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
 end
 
 # cuda_tile.mini
-@eval Intrinsics begin
-    @noinline function mini(x::T, y::T, s::Signedness) where {T<:Integer}
-        lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-        ifelse(lt, x, y)
-    end
-    @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} =
-        Tile{T, S}()
+@intrinsic function mini(x::T, y::T, s::Signedness) where {T<:Integer}
+    lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
+    ifelse(lt, x, y)
+end
+@intrinsic mini(a::Tile, b::Tile, s)
+function tfunc(::typeof(Intrinsics.mini), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness"))
@@ -204,40 +210,43 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
 end
 
 # cuda_tile.muli
-@eval Intrinsics begin
-    @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
-    @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
+@intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
+@intrinsic muli(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.muli), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args)
     emit_binop!(ctx, args, encode_MulIOp!)
 end
 
 # cuda_tile.mulhii
-@eval Intrinsics begin
-    """High bits of integer multiply (for extended precision arithmetic). Compiled to cuda_tile.mulhii."""
-    @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
-        ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T
-    end
-    @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}()
+@intrinsic function mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
+    ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T
+end
+@intrinsic mulhii(a::Tile, b::Tile, s)
+function tfunc(::typeof(Intrinsics.mulhii), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args)
     emit_binop!(ctx, args, encode_MulhiIOp!)
 end
 
 # cuda_tile.negi
-@eval Intrinsics begin
-    @noinline negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x)
-    @noinline negi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a)
+@intrinsic negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x)
+@intrinsic negi(a::Tile)
+function tfunc(::typeof(Intrinsics.negi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args)
     emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone)
 end
 
 # cuda_tile.remi
-@eval Intrinsics begin
-    @noinline function remi(x::T, y::T, s::Signedness) where {T<:Integer}
-        s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y)
-    end
+@intrinsic function remi(x::T, y::T, s::Signedness) where {T<:Integer}
+    s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness"))
@@ -245,18 +254,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args)
 end
 
 # cuda_tile.shli
-@eval Intrinsics begin
-    @noinline shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T)
-end
+@intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args)
     emit_binop!(ctx, args, encode_ShLIOp!)
 end
 
 # cuda_tile.shri
-@eval Intrinsics begin
-    @noinline function shri(x::T, y::Integer, s::Signedness) where {T<:Integer}
-        s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T)
-    end
+@intrinsic function shri(x::T, y::Integer, s::Signedness) where {T<:Integer}
+    s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness"))
@@ -264,9 +269,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args)
 end
 
 # cuda_tile.subi
-@eval Intrinsics begin
-    @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
-    @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
+@intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
+@intrinsic subi(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.subi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args)
     emit_binop!(ctx, args, encode_SubIOp!)
@@ -276,42 +283,51 @@ end
 ## Floating-point arithmetic
 
 # cuda_tile.absf
-@eval Intrinsics begin
-    @noinline absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x)
-    @noinline absf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a)
+@intrinsic absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x)
+@intrinsic absf(a::Tile)
+function tfunc(::typeof(Intrinsics.absf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args)
     emit_unop!(ctx, args, encode_AbsFOp!)
 end
 
 # cuda_tile.addf
-@eval Intrinsics begin
-    @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y)
-    @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
+@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y)
+@intrinsic addf(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.addf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args)
     emit_binop!(ctx, args, encode_AddFOp!)
 end
 
 # cuda_tile.cmpf
-@eval Intrinsics begin
-    @noinline function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat}
-        if pred === CmpLessThan
-            Core.Intrinsics.lt_float(x, y)
-        elseif pred === CmpLessThanOrEqual
-            Core.Intrinsics.le_float(x, y)
-        elseif pred === CmpGreaterThan
-            Core.Intrinsics.lt_float(y, x)
-        elseif pred === CmpGreaterThanOrEqual
-            Core.Intrinsics.le_float(y, x)
-        elseif pred === CmpEqual
-            Core.Intrinsics.eq_float(x, y)
-        else  # CmpNotEqual
-            Core.Intrinsics.ne_float(x, y)
-        end
+@intrinsic function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat}
+    if pred === CmpLessThan
+        Core.Intrinsics.lt_float(x, y)
+    elseif pred === CmpLessThanOrEqual
+        Core.Intrinsics.le_float(x, y)
+    elseif pred === CmpGreaterThan
+        Core.Intrinsics.lt_float(y, x)
+    elseif pred === CmpGreaterThanOrEqual
+        Core.Intrinsics.le_float(y, x)
+    elseif pred === CmpEqual
+        Core.Intrinsics.eq_float(x, y)
+    else  # CmpNotEqual
+        Core.Intrinsics.ne_float(x, y)
+    end
+end
+@intrinsic cmpf(a::Tile, b::Tile, pred)
+function tfunc(::typeof(Intrinsics.cmpf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    if t <: Tile
+        S = t.parameters[2]
+        return Tile{Bool, S}
     end
-    @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} =
-        Tile{Bool, S}()
+    return nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args)
     cb = ctx.cb
@@ -336,36 +352,44 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args)
 end
 
 # cuda_tile.divf
-@eval Intrinsics begin
-    @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y)
-    @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
+@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y)
+@intrinsic divf(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.divf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args)
     emit_binop!(ctx, args, encode_DivFOp!)
 end
 
 # cuda_tile.mulf
-@eval Intrinsics begin
-    @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y)
-    @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
+@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y)
+@intrinsic mulf(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.mulf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args)
     emit_binop!(ctx, args, encode_MulFOp!)
 end
 
 # cuda_tile.negf
-@eval Intrinsics begin
-    @noinline negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x)
-    @noinline negf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a)
+@intrinsic negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x)
+@intrinsic negf(a::Tile)
+function tfunc(::typeof(Intrinsics.negf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args)
     emit_unop!(ctx, args, encode_NegFOp!)
 end
 
 # cuda_tile.subf
-@eval Intrinsics begin
-    @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y)
-    @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
+@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y)
+@intrinsic subf(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.subf), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args)
     emit_binop!(ctx, args, encode_SubFOp!)
@@ -375,10 +399,11 @@ end
 ## Boolean arithmetic
 
 # cuda_tile.andi
-@eval Intrinsics begin
-    @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
-    """Element-wise logical AND for boolean tiles."""
-    @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
+@intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
+@intrinsic andi(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.andi), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
     cb = ctx.cb
@@ -396,10 +421,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
 end
 
 # cuda_tile.ori
-@eval Intrinsics begin
-    @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y)
-    """Element-wise logical OR for boolean tiles."""
-    @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
+@intrinsic ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y)
+@intrinsic ori(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.ori), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
     cb = ctx.cb
@@ -417,10 +443,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
 end
 
 # cuda_tile.xori
-@eval Intrinsics begin
-    @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y)
-    """Element-wise logical XOR for boolean tiles."""
-    @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
+@intrinsic xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y)
+@intrinsic xori(a::Tile, b::Tile)
+function tfunc(::typeof(Intrinsics.xori), argtypes::Vector{Any})
+    t = CC.widenconst(argtypes[2])
+    t <: Tile ? t : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
     cb = ctx.cb
diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl
index faabebf..79258fa 100644
--- a/src/compiler/intrinsics/atomics.jl
+++ b/src/compiler/intrinsics/atomics.jl
@@ -31,19 +31,9 @@ function memory_scope_to_scope(scope::Int)
 end
 
 # cuda_tile.atomic_cas_tko
-@eval Intrinsics begin
-    """
-        atomic_cas(array, index, expected, desired, memory_order, memory_scope)
-
-    Atomic compare-and-swap at 0-indexed position.
-    Returns the original value.
-    Compiled to cuda_tile.atomic_cas_tko.
-    """
-    @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired,
-                                   memory_order::Int, memory_scope::Int) where {T, N}
-        compilerbarrier(:const, zero(T))::T
-    end
-end
+@intrinsic atomic_cas(array, index, expected, desired,
+                      memory_order, memory_scope)
+tfunc(::typeof(Intrinsics.atomic_cas), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
 efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args)
@@ -170,19 +160,8 @@ function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode)
 end
 
 # cuda_tile.atomic_rmw_tko with XCHG
-@eval Intrinsics begin
-    """
-        atomic_xchg(array, index, val, memory_order, memory_scope)
-
-    Atomic exchange at 0-indexed position.
-    Returns the original value.
-    Compiled to cuda_tile.atomic_rmw_tko with XCHG.
-    """
-    @noinline function atomic_xchg(array::TileArray{T, N}, index, val,
-                                    memory_order::Int, memory_scope::Int) where {T, N}
-        compilerbarrier(:const, zero(T))
-    end
-end
+@intrinsic atomic_xchg(array, index, val, memory_order, memory_scope)
+tfunc(::typeof(Intrinsics.atomic_xchg), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
 efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args)
@@ -190,19 +169,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args)
 end
 
 # cuda_tile.atomic_rmw_tko with ADD
-@eval Intrinsics begin
-    """
-        atomic_add(array, index, val, memory_order, memory_scope)
-
-    Atomic addition at 0-indexed position.
-    Returns the original value.
-    Compiled to cuda_tile.atomic_rmw_tko with ADD.
-    """
-    @noinline function atomic_add(array::TileArray{T, N}, index, val,
-                                   memory_order::Int, memory_scope::Int) where {T, N}
-        compilerbarrier(:const, zero(T))
-    end
-end
+@intrinsic atomic_add(array, index, val,
+                      memory_order, memory_scope)
+tfunc(::typeof(Intrinsics.atomic_add), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
 efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args)
diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl
index 6c33afc..6aa879f 100644
--- a/src/compiler/intrinsics/conversions.jl
+++ b/src/compiler/intrinsics/conversions.jl
@@ -3,10 +3,8 @@
 # TODO: cuda_tile.bitcast
 
 # cuda_tile.exti (scalar integer extension)
-@eval Intrinsics begin
-    @noinline function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
-        s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x)
-    end
+@intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
+    s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args)
     cb = ctx.cb
@@ -26,10 +24,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args)
 end
 
 # cuda_tile.ftof (scalar float to float)
-@eval Intrinsics begin
-    @noinline function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat}
-        sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x)
-    end
+@intrinsic function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat}
+    sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args)
     cb = ctx.cb
@@ -48,10 +44,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args)
 end
 
 # cuda_tile.ftoi (scalar float to integer)
-@eval Intrinsics begin
-    @noinline function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer}
-        s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x)
-    end
+@intrinsic function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer}
+    s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args)
     cb = ctx.cb
@@ -71,10 +65,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args)
 end
 
 # cuda_tile.itof (scalar integer to float)
-@eval Intrinsics begin
-    @noinline function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat}
-        s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x)
-    end
+@intrinsic function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat}
+    s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args)
     cb = ctx.cb
@@ -94,9 +86,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args)
 end
 
 # cuda_tile.trunci (scalar integer truncation)
-@eval Intrinsics begin
-    @noinline trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x)
-end
+@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index b64fbcf..abe3f34 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -19,17 +19,7 @@ function validate_tile_shape(shape, context::String)
 end
 
 # cuda_tile.broadcast
-@eval Intrinsics begin
-    """
-        broadcast(tile, shape_val)
-
-    Explicitly broadcast a tile to a target shape.
-    Compiled to cuda_tile.broadcast.
-    """
-    @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic broadcast(tile, shape)
 function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tile_type = CC.widenconst(argtypes[2])
@@ -109,17 +99,7 @@ function broadcast_tile_to_shape!(cb::CodeBuilder, tt::TypeTable, tv::CGVal,
 end
 
 # cuda_tile.cat
-@eval Intrinsics begin
-    """
-        cat(tiles, axis_val)
-
-    Concatenate two tiles along 0-indexed axis.
-    Compiled to cuda_tile.cat.
-    """
-    @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic cat(tiles, axis)
 function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tuple_type = CC.widenconst(argtypes[2])
@@ -186,17 +166,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args)
 end
 
 # cuda_tile.constant
-@eval Intrinsics begin
-    """
-        constant(shape, value, T)
-
-    Create a tile filled with a constant value.
-    Compiled to cuda_tile.constant.
-    """
-    @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic constant(shape, value, T)
 function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any})
     length(argtypes) >= 4 || return nothing
     shape_arg = argtypes[2]
@@ -236,17 +206,7 @@ end
 # TODO: cuda_tile.entry
 
 # cuda_tile.extract
-@eval Intrinsics begin
-    """
-        extract(tile, index_val, shape_val)
-
-    Extract a sub-tile from tile at 0-indexed slice indices.
-    Compiled to cuda_tile.extract.
-    """
-    @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic extract(tile, index, shape)
 function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any})
     length(argtypes) >= 4 || return nothing
     tile_type = CC.widenconst(argtypes[2])
@@ -300,15 +260,8 @@ end
 # TODO: cuda_tile.get_global
 
 # cuda_tile.get_num_tile_blocks
-@eval Intrinsics begin
-    """
-        get_num_tile_blocks(axis)::Int32
-
-    Get the grid size along the given axis (0=x, 1=y, 2=z).
-    Compiled to cuda_tile.get_num_tile_blocks.
-    """
-    @noinline get_num_tile_blocks(axis::Integer) = compilerbarrier(:const, zero(Int32))
-end
+@intrinsic get_num_tile_blocks(axis)
+tfunc(::typeof(Intrinsics.get_num_tile_blocks), argtypes::Vector{Any}) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args)
     axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant"))
     axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis"))
@@ -320,15 +273,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), a
 end
 
 # cuda_tile.get_tile_block_id
-@eval Intrinsics begin
-    """
-        get_tile_block_id(axis)::Int32
-
-    Get the block ID along the given axis (0=x, 1=y, 2=z).
-    Compiled to cuda_tile.get_tile_block_id.
-    """
-    @noinline get_tile_block_id(axis::Integer) = compilerbarrier(:const, zero(Int32))
-end
+@intrinsic get_tile_block_id(axis)
+tfunc(::typeof(Intrinsics.get_tile_block_id), argtypes::Vector{Any}) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args)
     axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant"))
     axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis"))
@@ -343,17 +289,7 @@ end
 # TODO: cuda_tile.global
 
 # cuda_tile.iota
-@eval Intrinsics begin
-    """
-        iota(shape, T)
-
-    Create a 1D tile with values [0, 1, 2, ..., shape[1]-1] (0-indexed).
-    Compiled to cuda_tile.iota.
-    """
-    @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic iota(shape, T)
 function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     shape_arg = argtypes[2]
@@ -387,17 +323,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args)
 end
 
 # cuda_tile.mmaf, cuda_tile.mmai
-@eval Intrinsics begin
-    """
-        mma(a, b, acc)
-
-    Matrix-multiply-accumulate: result = a @ b + acc.
-    Compiled to cuda_tile.mmaf or cuda_tile.mmai.
-    """
-    @noinline function mma(a::Tile{T1}, b::Tile{T2}, acc::Tile{T3, SC}) where {T1, T2, T3, SC}
-        Tile{T3, SC}()
-    end
-end
+@intrinsic mma(a, b, acc)
+tfunc(::typeof(Intrinsics.mma), argtypes::Vector{Any}) = CC.widenconst(argtypes[4])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args)
     cb = ctx.cb
 
@@ -415,16 +342,16 @@ end
 # TODO: cuda_tile.module
 
 # cuda_tile.offset
-@eval Intrinsics begin
-    """
-        offset(base, offsets)
-
-    Compute base_ptr + offsets for each element of offsets tile (element-scaled).
-    Returns a tile of pointers. Compiled to cuda_tile.offset.
-    """
-    @noinline function offset(base::Ptr{T}, offsets::Tile{I, S}) where {T, I <: Integer, S}
-        Tile{Ptr{T}, S}()
-    end
+@intrinsic offset(base, offsets)
+function tfunc(::typeof(Intrinsics.offset), argtypes::Vector{Any})
+    length(argtypes) >= 3 || return nothing
+    base_type = CC.widenconst(argtypes[2])
+    base_type <: Ptr || return nothing
+    offsets_type = CC.widenconst(argtypes[3])
+    offsets_type <: Tile || return nothing
+    T = eltype(base_type)
+    S = offsets_type.parameters[2]
+    return Tile{Ptr{T}, S}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.offset), args)
     cb = ctx.cb
@@ -469,17 +396,7 @@ end
 # TODO: cudatile.pack
 
 # cuda_tile.permute
-@eval Intrinsics begin
-    """
-        permute(tile, perm_val)
-
-    Permute tile dimensions according to 0-indexed permutation.
-    Compiled to cuda_tile.permute.
-    """
-    @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic permute(tile, perm)
 function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tile_type = CC.widenconst(argtypes[2])
@@ -529,17 +446,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args)
 end
 
 # cuda_tile.transpose
-@eval Intrinsics begin
-    """
-        transpose(tile)
-
-    Transpose a 2D tile, swapping its dimensions.
-    Compiled to cuda_tile.permute with perm=(1, 0).
-    """
-    @noinline function transpose(tile::Tile{T}) where {T}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic transpose(tile)
 function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any})
     length(argtypes) >= 2 || return nothing
     tile_type = CC.widenconst(argtypes[2])
@@ -576,24 +483,7 @@ end
 
 
 # cuda_tile.reduce
-@eval Intrinsics begin
-    """
-        reduce(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple) -> Tuple{Tile...}
-
-    Reduce tiles along a 0-indexed axis using combiner `f` with per-operand
-    identity values. Accepts and returns tuples of tiles; single-operand
-    callers wrap in 1-tuples and unwrap with `[1]`.
-    Compiled to cuda_tile.reduce.
-    """
-    @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f,
-                              identities::Tuple{Any}) where {T, S}
-        compilerbarrier(:type, nothing)
-    end
-    @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f,
-                              identities::Tuple{Any, Any}) where {T1, T2, S}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic reduce(tiles, axis, f, identities)
 function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tuple_type = CC.widenconst(argtypes[2])
@@ -724,17 +614,7 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer =
     IntegerIdentityVal(to_uint128(T(val)), dtype, T)
 
 # cuda_tile.reshape
-@eval Intrinsics begin
-    """
-        reshape(tile, shape_val)
-
-    Reshape a tile to a new shape (same total elements).
-    Compiled to cuda_tile.reshape.
-    """
-    @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic reshape(tile, shape)
 function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tile_type = CC.widenconst(argtypes[2])
@@ -803,21 +683,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args)
 end
 
 # cuda_tile.scan
-@eval Intrinsics begin
-    """
-        scan(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple, reverse=false) -> Tuple{Tile...}
-
-    Parallel prefix scan along a 0-indexed axis using combiner `f` with
-    per-operand identity values. Accepts and returns tuples of tiles;
-    single-operand callers wrap in 1-tuples and unwrap with `[1]`.
-    `reverse=true` for a reverse (suffix) scan.
-    Compiled to cuda_tile.scan.
-    """
-    @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f,
-                            identities::Tuple{Any}, reverse::Bool=false) where {T, S}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic scan(tiles, axis, f, identities, reverse=false)
 function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any})
     length(argtypes) >= 2 || return nothing
     tuple_type = CC.widenconst(argtypes[2])
@@ -916,17 +782,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)
 end
 
 # cuda_tile.select
-@eval Intrinsics begin
-    """
-        select(cond, x, y)
-
-    Element-wise conditional selection.
-    Compiled to cuda_tile.select.
-    """
-    @noinline select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y)
-    @noinline function select(cond::Tile{Bool, S}, x::Tile{T, S}, y::Tile{T, S}) where {T, S}
-        Tile{T, S}()
-    end
+@intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y)
+@intrinsic select(cond::Tile, x, y)
+function tfunc(::typeof(Intrinsics.select), argtypes::Vector{Any})
+    length(argtypes) >= 3 || return nothing
+    cond_type = CC.widenconst(argtypes[2])
+    cond_type <: Tile ? CC.widenconst(argtypes[3]) : nothing
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args)
     cb = ctx.cb
@@ -947,10 +808,8 @@ end
 # These are codegen-only reinterpret intrinsics for map(f, tile).
 # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped.
 # from_scalar: restores jltype to Tile{T, S}.
-@eval Intrinsics begin
-    @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:type, nothing)
-    @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}()
-end
+@intrinsic to_scalar(tile)
+@intrinsic from_scalar(x, S)
 function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     T = CC.widenconst(argtypes[2])
diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl
index ded13df..8bd3e93 100644
--- a/src/compiler/intrinsics/math.jl
+++ b/src/compiler/intrinsics/math.jl
@@ -3,41 +3,29 @@
 ## Floating-point math
 
 # cuda_tile.ceil
-@eval Intrinsics begin
-    """Ceiling (round toward positive infinity). Compiled to cuda_tile.ceil."""
-    @noinline ceil(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline ceil(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic ceil(x)
+tfunc(::typeof(Intrinsics.ceil), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args)
     emit_unop!(ctx, args, encode_CeilOp!)
 end
 
 # cuda_tile.cos
-@eval Intrinsics begin
-    """Cosine. Compiled to cuda_tile.cos."""
-    @noinline cos(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline cos(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic cos(x)
+tfunc(::typeof(Intrinsics.cos), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args)
     emit_unop!(ctx, args, encode_CosOp!)
 end
 
 # cuda_tile.cosh
-@eval Intrinsics begin
-    """Hyperbolic cosine. Compiled to cuda_tile.cosh."""
-    @noinline cosh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline cosh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic cosh(x)
+tfunc(::typeof(Intrinsics.cosh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args)
     emit_unop!(ctx, args, encode_CosHOp!)
 end
 
 # cuda_tile.exp2
-@eval Intrinsics begin
-    """Base-2 exponential (2^x). Compiled to cuda_tile.exp2."""
-    @noinline exp2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline exp2(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic exp2(x, flush_to_zero=false)
+tfunc(::typeof(Intrinsics.exp2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args)
     cb = ctx.cb
 
@@ -52,11 +40,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args)
 end
 
 # cuda_tile.exp
-@eval Intrinsics begin
-    """Natural exponential (e^x). Compiled to cuda_tile.exp."""
-    @noinline exp(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline exp(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic exp(x)
+tfunc(::typeof(Intrinsics.exp), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args)
     cb = ctx.cb
 
@@ -69,21 +54,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args)
 end
 
 # cuda_tile.floor
-@eval Intrinsics begin
-    """Floor (round toward negative infinity). Compiled to cuda_tile.floor."""
-    @noinline floor(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline floor(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic floor(x)
+tfunc(::typeof(Intrinsics.floor), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args)
     emit_unop!(ctx, args, encode_FloorOp!)
 end
 
 # cuda_tile.fma
-@eval Intrinsics begin
-    """Fused multiply-add: a * b + c. Compiled to cuda_tile.fma."""
-    @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
-end
+@intrinsic fma(x, y, z)
+tfunc(::typeof(Intrinsics.fma), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
     cb = ctx.cb
 
@@ -99,11 +78,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
 end
 
 # cuda_tile.log2
-@eval Intrinsics begin
-    """Base-2 logarithm. Compiled to cuda_tile.log2."""
-    @noinline log2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline log2(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic log2(x)
+tfunc(::typeof(Intrinsics.log2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args)
     cb = ctx.cb
 
@@ -116,11 +92,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args)
 end
 
 # cuda_tile.log
-@eval Intrinsics begin
-    """Element-wise natural logarithm. Compiled to cuda_tile.log."""
-    @noinline log(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline log(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic log(x)
+tfunc(::typeof(Intrinsics.log), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args)
     cb = ctx.cb
 
@@ -133,49 +106,36 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args)
 end
 
 # cuda_tile.maxf
-@eval Intrinsics begin
-    @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y)
-    @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
-end
+@intrinsic maxf(x, y)
+tfunc(::typeof(Intrinsics.maxf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args)
     emit_binop!(ctx, args, encode_MaxFOp!)
 end
 
 # cuda_tile.minf
-@eval Intrinsics begin
-    @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y)
-    @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
-end
+@intrinsic minf(x, y)
+tfunc(::typeof(Intrinsics.minf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args)
     emit_binop!(ctx, args, encode_MinFOp!)
 end
 
 # cuda_tile.pow
-@eval Intrinsics begin
-    """Element-wise power. Compiled to cuda_tile.pow."""
-    @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
-end
+@intrinsic pow(x, y)
+tfunc(::typeof(Intrinsics.pow), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args)
     emit_binop!(ctx, args, encode_PowOp!)
 end
 
 # cuda_tile.remf
-@eval Intrinsics begin
-    """Element-wise floating-point remainder. Compiled to cuda_tile.remf."""
-    @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
-end
+@intrinsic remf(x, y)
+tfunc(::typeof(Intrinsics.remf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args)
     emit_binop!(ctx, args, encode_RemFOp!)
 end
 
 # cuda_tile.rsqrt
-@eval Intrinsics begin
-    """Element-wise reciprocal square root. Compiled to cuda_tile.rsqrt."""
-    @noinline rsqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline rsqrt(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic rsqrt(x, flush_to_zero=false)
+tfunc(::typeof(Intrinsics.rsqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
     cb = ctx.cb
 
@@ -190,31 +150,22 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
 end
 
 # cuda_tile.sin
-@eval Intrinsics begin
-    """Element-wise sine. Compiled to cuda_tile.sin."""
-    @noinline sin(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sin(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sin(x)
+tfunc(::typeof(Intrinsics.sin), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args)
     emit_unop!(ctx, args, encode_SinOp!)
 end
 
 # cuda_tile.sinh
-@eval Intrinsics begin
-    """Element-wise hyperbolic sine. Compiled to cuda_tile.sinh."""
-    @noinline sinh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sinh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sinh(x)
+tfunc(::typeof(Intrinsics.sinh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args)
     emit_unop!(ctx, args, encode_SinHOp!)
 end
 
 # cuda_tile.sqrt
-@eval Intrinsics begin
-    """Element-wise square root. Compiled to cuda_tile.sqrt."""
-    @noinline sqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sqrt(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sqrt(x)
+tfunc(::typeof(Intrinsics.sqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
     cb = ctx.cb
 
@@ -227,21 +178,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
 end
 
 # cuda_tile.tan
-@eval Intrinsics begin
-    """Element-wise tangent. Compiled to cuda_tile.tan."""
-    @noinline tan(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tan(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tan(x)
+tfunc(::typeof(Intrinsics.tan), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args)
     emit_unop!(ctx, args, encode_TanOp!)
 end
 
 # cuda_tile.tanh
-@eval Intrinsics begin
-    """Element-wise hyperbolic tangent. Compiled to cuda_tile.tanh."""
-    @noinline tanh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tanh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tanh(x)
+tfunc(::typeof(Intrinsics.tanh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args)
     emit_unop!(ctx, args, encode_TanHOp!)
 end
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 4db4b46..f7bf9e5 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -3,23 +3,16 @@
 # TODO: cuda_tile.join_tokens
 
 # cuda_tile.load_ptr_tko
-@eval Intrinsics begin
-    """
-        load_ptr_tko(ptrs, latency, mask=nothing, padding=nothing)
-
-    Load values from a tile of pointers.
-    If mask is provided, masked-out positions return the padding value.
-    Compiled to cuda_tile.load_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based loads as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S},
-                                     latency::Union{Int, Nothing}=nothing,
-                                     mask::Union{Tile{Bool, S}, Nothing}=nothing,
-                                     padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
-        Tile{T, S}()
-    end
+@intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)
+function tfunc(::typeof(Intrinsics.load_ptr_tko), argtypes::Vector{Any})
+    length(argtypes) >= 2 || return nothing
+    ptrs_type = CC.widenconst(argtypes[2])
+    ptrs_type <: Tile || return nothing
+    ptr_type = eltype(ptrs_type)
+    ptr_type <: Ptr || return nothing
+    T = eltype(ptr_type)
+    S = ptrs_type.parameters[2]
+    return Tile{T, S}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
     cb = ctx.cb
@@ -81,22 +74,10 @@ end
 # TODO: cuda_tile.make_token
 
 # cuda_tile.store_ptr_tko
-@eval Intrinsics begin
-    """
-        store_ptr_tko(ptrs, values, latency, mask=nothing)
-
-    Store values to a tile of pointers.
-    If mask is provided, masked-out positions are not written.
-    Compiled to cuda_tile.store_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based stores as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
-                                      latency::Union{Int, Nothing},
-                                      mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        nothing
-    end
+@intrinsic function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
+                                            latency::Union{Int, Nothing},
+                                            mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
+    nothing
 end
 efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl
index 19a8534..2a0a784 100644
--- a/src/compiler/intrinsics/misc.jl
+++ b/src/compiler/intrinsics/misc.jl
@@ -1,10 +1,8 @@
 # miscellaneous intrinsics
 
 # cuda_tile.assert
-@eval Intrinsics begin
-    @noinline function assert(cond::Bool, message::String)
-        nothing
-    end
+@intrinsic function assert(cond::Bool, message::String)
+    nothing
 end
 efunc(::typeof(Intrinsics.assert), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index 1c6e7c6..fd1bdde 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -24,17 +24,8 @@ function get_padding_value(ctx::CGCtx, args)
 end
 
 # cuda_tile.get_index_space_shape
-@eval Intrinsics begin
-    """
-        get_index_space_shape(pv::PartitionView, axis) -> Int32
-
-    Get the number of tiles along the given axis (0-indexed).
-    Compiled to cuda_tile.get_index_space_shape.
-    """
-    @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape}
-        compilerbarrier(:const, zero(Int32))
-    end
-end
+@intrinsic get_index_space_shape(pv, axis)
+tfunc(::typeof(Intrinsics.get_index_space_shape), argtypes::Vector{Any}) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -69,20 +60,7 @@ end
 # TODO: cuda_tile.get_tensor_shape
 
 # cuda_tile.load_view_tko
-@eval Intrinsics begin
-    """
-        load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile
-
-    Load a tile from a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.load_view_tko.
-    """
-    @noinline function load_partition_view(pv::PartitionView{T, N, Shape},
-                                            latency::Union{Int, Nothing},
-                                            allow_tma::Bool,
-                                            indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic load_partition_view(pv, latency, allow_tma, indices)
 function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any})
     length(argtypes) >= 2 || return nothing
     pv_type = CC.widenconst(argtypes[2])
@@ -172,19 +150,7 @@ function pad_indices(ctx::CGCtx, index_vals::Vector{Value}, ndim::Int, idx_type:
 end
 
 # cuda_tile.make_partition_view
-@eval Intrinsics begin
-    """
-        make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView
-
-    Create a PartitionView from a TensorView with the given tile shape.
-    The `order` parameter (NTuple{N,Int} or nothing) specifies
-    the logical-to-physical dimension mapping (1-indexed), or identity if nothing.
-    Compiled to cuda_tile.make_partition_view.
-    """
-    @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M}
-        compilerbarrier(:type, nothing)
-    end
-end
+@intrinsic make_partition_view(tv, shape, padding_mode, order)
 function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any})
     length(argtypes) >= 3 || return nothing
     tv_type = CC.widenconst(argtypes[2])
@@ -336,16 +302,8 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I
 end
 
 # cuda_tile.make_tensor_view
-@eval Intrinsics begin
-    """
-        make_tensor_view(arr::TileArray) -> TensorView
-
-    Create a TensorView from a TileArray.
-    Compiled to cuda_tile.make_tensor_view.
-    """
-    @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
-        TensorView{T, N}()
-    end
+@intrinsic function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
+    TensorView{T, N}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args)
     array_arg = args[1]
@@ -366,20 +324,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args
 end
 
 # cuda_tile.store_view_tko
-@eval Intrinsics begin
-    """
-        store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing
-
-    Store a tile to a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.store_view_tko.
-    """
-    @noinline function store_partition_view(pv::PartitionView{T, N, Shape},
-                                             tile::Tile{T},
-                                             latency::Union{Int, Nothing},
-                                             allow_tma::Bool,
-                                             indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        nothing
-    end
+@intrinsic function store_partition_view(pv::PartitionView{T, N, Shape},
+                                                   tile::Tile{T},
+                                                   latency::Union{Int, Nothing},
+                                                   allow_tma::Bool,
+                                                   indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
+    nothing
 end
 efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)

From 42391fb29c7a8468b9e5afc42398a5e3ff5ce765 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Mon, 9 Feb 2026 21:47:12 +0100
Subject: [PATCH 04/17] Detect when intrinsics are executed by the compiler.

---
 src/compiler/intrinsics.jl | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 4cb6d0a..9505bcf 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -4,7 +4,7 @@
 
 module Intrinsics
 
-using Base: compilerbarrier
+using Base: compilerbarrier, inferencebarrier
 using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView
 using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned
 using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual
@@ -39,10 +39,16 @@ provide a correct scalar implementation using `Core.Intrinsics`, or return
 `nothing` for side-effect-only intrinsics.
 """
 macro intrinsic(ex)
-    if ex isa Expr && ex.head in (:function, :(=))
-        funcdef = combinedef(splitdef(ex))
+    funcdef = if ex isa Expr && ex.head in (:function, :(=))
+        combinedef(splitdef(ex))
     else
-        funcdef = Expr(:function, ex, quote compilerbarrier(:type, nothing) end)
+        # Render now: "$(...)" inside `quote` would be deferred to run time (no `ex` there).
+        msg = "Intrinsic $(string(ex)) cannot be evaluated at compile time"
+        body = quote
+            inferencebarrier(true)::Bool && error($msg)
+            compilerbarrier(:type, nothing)
+        end
+        Expr(:function, ex, body)
     end
     funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef)
     return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef)))))

From 7178f62cd9768fce16404829233aa5f16e8f4c4a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 10 Feb 2026 09:36:58 +0100
Subject: [PATCH 05/17] Remove more intrinsic bodies and pass the lattice to tfuncs.

---
 src/compiler/interface.jl              |  11 +-
 src/compiler/intrinsics.jl             |  10 +-
 src/compiler/intrinsics/arithmetic.jl  | 169 ++++++++-----------------
 src/compiler/intrinsics/atomics.jl     |   6 +-
 src/compiler/intrinsics/conversions.jl |  38 +++++-
 src/compiler/intrinsics/core.jl        |  90 +++++--------
 src/compiler/intrinsics/math.jl        |  38 +++---
 src/compiler/intrinsics/memory.jl      |  14 +-
 src/compiler/intrinsics/misc.jl        |   5 +-
 src/compiler/intrinsics/views.jl       |  33 +++--
 10 files changed, 173 insertions(+), 241 deletions(-)

diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index 9080e87..f69fc36 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -79,7 +79,7 @@ CC.may_discard_trees(::cuTileInterpreter) = false
 
 # Per-intrinsic return type overrides.
 # Returns nothing when no override applies (fallback).
-tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing
+tfunc(𝕃, @nospecialize(f), @nospecialize args...) = nothing
 
 # Per-intrinsic effect overrides.
 # Returns nothing when no override applies (fallback).
@@ -179,7 +179,8 @@ end
             arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing},
             sv::CC.InferenceState, max_methods::Int)
         is_intr = isintrinsic(f)
-        rt_override = tfunc(f, arginfo.argtypes)
+        𝕃 = CC.typeinf_lattice(interp)
+        rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...)
         subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv)
         !is_intr && rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
@@ -211,7 +212,8 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
             arginfo::CC.ArgInfo, si::CC.StmtInfo,
             sv::CC.InferenceState, max_methods::Int)
         is_intr = isintrinsic(f)
-        rt_override = tfunc(f, arginfo.argtypes)
+        𝕃 = CC.typeinf_lattice(interp)
+        rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...)
         subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv)
         !is_intr && rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
@@ -244,7 +246,8 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
             sv::CC.AbsIntState, max_methods::Int)
         _infer_subprogram(interp, f, arginfo, si, nothing, sv)  # side-effect only
         is_intr = isintrinsic(f)
-        rt_override = tfunc(f, arginfo.argtypes)
+        𝕃 = CC.typeinf_lattice(interp)
+        rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...)
         rt = rt_override !== nothing ? rt_override : result.rt
         efunc_override = is_intr ? efunc(f, result.effects) : nothing
         effects = efunc_override !== nothing ? efunc_override : result.effects
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 9505bcf..df0fc48 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -12,12 +12,10 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal
 
 end
 
-# NOTE: Intrinsics are never directly folded (concrete_eval_eligible returns :none,
-#       nonoverlayed=ALWAYS_FALSE taints caller effects). However, overlay callers
-#       with @assume_effects :foldable override the propagated effects, causing the
-#       compiler to concrete-evaluate through intrinsic bodies (JuliaLang/julia#60583).
-#       Intrinsics on such paths need callable bodies (function definition form).
-#       All others use compilerbarrier(:type, nothing) as a dummy body (bare signature).
+# NOTE: Intrinsics use bare signatures with dummy bodies (compilerbarrier(:type, nothing)).
+#       Return types are provided by tfunc overrides in the interpreter.
+#       Const-prop for overlay callers happens via @assume_effects :foldable at the
+#       overlay level, not through intrinsic bodies.
 
 using ExprTools: splitdef, combinedef
 
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 861731b..05a0a2c 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -84,31 +84,24 @@ end
 ## Integer arithmetic
 
 # cuda_tile.absi
-@intrinsic absi(x::T) where {T<:Integer} =
-    ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x)
-@intrinsic absi(a::Tile)
-function tfunc(::typeof(Intrinsics.absi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+@intrinsic absi(x::Integer)
+@intrinsic absi(x::Tile{<:Integer})
+tfunc(𝕃, ::typeof(Intrinsics.absi), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args)
     emit_unop!(ctx, args, encode_AbsIOp!)
 end
 
 # cuda_tile.addi
-@intrinsic addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y)
-@intrinsic addi(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.addi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+@intrinsic addi(x::T, y::T) where {T<:Integer}
+@intrinsic addi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
+tfunc(𝕃, ::typeof(Intrinsics.addi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args)
     emit_binop!(ctx, args, encode_AddIOp!)
 end
 
 # cuda_tile.cldi (ceiling division, toward positive infinity)
 @intrinsic cldi(x, y, s)
-tfunc(::typeof(Intrinsics.cldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf)
@@ -131,13 +124,13 @@ end
     end
 end
 @intrinsic cmpi(a::Tile, b::Tile, pred, s)
-function tfunc(::typeof(Intrinsics.cmpi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s))
+    t = CC.widenconst(x)
     if t <: Tile
         S = t.parameters[2]
         return Tile{Bool, S}
     end
-    return nothing
+    return Bool
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args)
     cb = ctx.cb
@@ -166,6 +159,8 @@ end
 @intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer}
     s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y)
 end
+@intrinsic divi(a::Tile, b::Tile, s)
+tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingZero)
@@ -173,7 +168,7 @@ end
 
 # cuda_tile.fldi (floor division, toward negative infinity)
 @intrinsic fldi(x, y, s)
-tfunc(::typeof(Intrinsics.fldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf)
@@ -185,25 +180,16 @@ end
     ifelse(lt, y, x)
 end
 @intrinsic maxi(a::Tile, b::Tile, s)
-function tfunc(::typeof(Intrinsics.maxi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_MaxIOp!; signedness)
 end
 
 # cuda_tile.mini
-@intrinsic function mini(x::T, y::T, s::Signedness) where {T<:Integer}
-    lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-    ifelse(lt, x, y)
-end
+@intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer}
 @intrinsic mini(a::Tile, b::Tile, s)
-function tfunc(::typeof(Intrinsics.mini), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness"))
     emit_binop!(ctx, args, encode_MinIOp!; signedness)
@@ -212,42 +198,31 @@ end
 # cuda_tile.muli
 @intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
 @intrinsic muli(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.muli), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args)
     emit_binop!(ctx, args, encode_MulIOp!)
 end
 
 # cuda_tile.mulhii
-@intrinsic function mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
-    ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T
-end
+@intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
 @intrinsic mulhii(a::Tile, b::Tile, s)
-function tfunc(::typeof(Intrinsics.mulhii), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args)
     emit_binop!(ctx, args, encode_MulhiIOp!)
 end
 
 # cuda_tile.negi
-@intrinsic negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x)
+@intrinsic negi(x::T) where {T<:Integer}
 @intrinsic negi(a::Tile)
-function tfunc(::typeof(Intrinsics.negi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args)
     emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone)
 end
 
 # cuda_tile.remi
-@intrinsic function remi(x::T, y::T, s::Signedness) where {T<:Integer}
-    s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y)
-end
+@intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer}
+@intrinsic remi(a::Tile, b::Tile, s)
+tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness"))
     emit_binop!(ctx, args, encode_RemIOp!; signedness)
@@ -255,14 +230,16 @@ end
 
 # cuda_tile.shli
 @intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T)
+@intrinsic shli(a::Tile, b::Tile)
+tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args)
     emit_binop!(ctx, args, encode_ShLIOp!)
 end
 
 # cuda_tile.shri
-@intrinsic function shri(x::T, y::Integer, s::Signedness) where {T<:Integer}
-    s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T)
-end
+@intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer}
+@intrinsic shri(a::Tile, b::Tile, s)
+tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness"))
     emit_binop!(ctx, args, encode_ShRIOp!; signedness)
@@ -271,10 +248,7 @@ end
 # cuda_tile.subi
 @intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
 @intrinsic subi(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.subi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args)
     emit_binop!(ctx, args, encode_SubIOp!)
 end
@@ -283,51 +257,31 @@ end
 ## Floating-point arithmetic
 
 # cuda_tile.absf
-@intrinsic absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x)
+@intrinsic absf(x::T) where {T<:AbstractFloat}
 @intrinsic absf(a::Tile)
-function tfunc(::typeof(Intrinsics.absf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args)
     emit_unop!(ctx, args, encode_AbsFOp!)
 end
 
 # cuda_tile.addf
-@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y)
+@intrinsic addf(x::T, y::T) where {T<:AbstractFloat}
 @intrinsic addf(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.addf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args)
     emit_binop!(ctx, args, encode_AddFOp!)
 end
 
 # cuda_tile.cmpf
-@intrinsic function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat}
-    if pred === CmpLessThan
-        Core.Intrinsics.lt_float(x, y)
-    elseif pred === CmpLessThanOrEqual
-        Core.Intrinsics.le_float(x, y)
-    elseif pred === CmpGreaterThan
-        Core.Intrinsics.lt_float(y, x)
-    elseif pred === CmpGreaterThanOrEqual
-        Core.Intrinsics.le_float(y, x)
-    elseif pred === CmpEqual
-        Core.Intrinsics.eq_float(x, y)
-    else  # CmpNotEqual
-        Core.Intrinsics.ne_float(x, y)
-    end
-end
+@intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat}
 @intrinsic cmpf(a::Tile, b::Tile, pred)
-function tfunc(::typeof(Intrinsics.cmpf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred))
+    t = CC.widenconst(x)
     if t <: Tile
         S = t.parameters[2]
         return Tile{Bool, S}
     end
-    return nothing
+    return Bool
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args)
     cb = ctx.cb
@@ -352,45 +306,33 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args)
 end
 
 # cuda_tile.divf
-@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y)
+@intrinsic divf(x::T, y::T) where {T<:AbstractFloat}
 @intrinsic divf(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.divf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args)
     emit_binop!(ctx, args, encode_DivFOp!)
 end
 
 # cuda_tile.mulf
-@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y)
+@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat}
 @intrinsic mulf(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.mulf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args)
     emit_binop!(ctx, args, encode_MulFOp!)
 end
 
 # cuda_tile.negf
-@intrinsic negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x)
+@intrinsic negf(x::T) where {T<:AbstractFloat}
 @intrinsic negf(a::Tile)
-function tfunc(::typeof(Intrinsics.negf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args)
     emit_unop!(ctx, args, encode_NegFOp!)
 end
 
 # cuda_tile.subf
-@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y)
+@intrinsic subf(x::T, y::T) where {T<:AbstractFloat}
 @intrinsic subf(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.subf), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args)
     emit_binop!(ctx, args, encode_SubFOp!)
 end
@@ -401,10 +343,7 @@ end
 # cuda_tile.andi
 @intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
 @intrinsic andi(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.andi), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -421,12 +360,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
 end
 
 # cuda_tile.ori
-@intrinsic ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y)
+@intrinsic ori(x::T, y::T) where {T<:Integer}
 @intrinsic ori(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.ori), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -443,12 +379,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
 end
 
 # cuda_tile.xori
-@intrinsic xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y)
+@intrinsic xori(x::T, y::T) where {T<:Integer}
 @intrinsic xori(a::Tile, b::Tile)
-function tfunc(::typeof(Intrinsics.xori), argtypes::Vector{Any})
-    t = CC.widenconst(argtypes[2])
-    t <: Tile ? t : nothing
-end
+tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl
index 79258fa..9c480bf 100644
--- a/src/compiler/intrinsics/atomics.jl
+++ b/src/compiler/intrinsics/atomics.jl
@@ -33,7 +33,7 @@ end
 # cuda_tile.atomic_cas_tko
 @intrinsic atomic_cas(array, index, expected, desired,
                       memory_order, memory_scope)
-tfunc(::typeof(Intrinsics.atomic_cas), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
+tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array))
 efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args)
@@ -161,7 +161,7 @@ end
 
 # cuda_tile.atomic_rmw_tko with XCHG
 @intrinsic atomic_xchg(array, index, val, memory_order, memory_scope)
-tfunc(::typeof(Intrinsics.atomic_xchg), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
+tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array))
 efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args)
@@ -171,7 +171,7 @@ end
 # cuda_tile.atomic_rmw_tko with ADD
 @intrinsic atomic_add(array, index, val,
                       memory_order, memory_scope)
-tfunc(::typeof(Intrinsics.atomic_add), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2]))
+tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array))
 efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args)
diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl
index 6aa879f..638b05d 100644
--- a/src/compiler/intrinsics/conversions.jl
+++ b/src/compiler/intrinsics/conversions.jl
@@ -6,6 +6,12 @@
 @intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
     s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x)
 end
+function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
+    tgt = CC.widenconst(target_type)  # widen the lattice element to a plain Julia type
+    CC.isType(tgt) || return nothing  # only Type{T} has a T; bare DataType/Union would throw below
+    src, T = CC.widenconst(x), tgt.parameters[1]
+    src <: Tile ? similar_type(src, T) : T  # Tile inputs go through similar_type, scalars yield T
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -24,8 +30,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args)
 end
 
 # cuda_tile.ftof (scalar float to float)
-@intrinsic function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat}
-    sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x)
+@intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat}
+function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type))
+    tgt = CC.widenconst(target_type)  # widen the lattice element to a plain Julia type
+    CC.isType(tgt) || return nothing  # only Type{T} has a T; bare DataType/Union would throw below
+    src, T = CC.widenconst(x), tgt.parameters[1]
+    src <: Tile ? similar_type(src, T) : T  # Tile inputs go through similar_type, scalars yield T
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args)
     cb = ctx.cb
@@ -44,8 +54,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args)
 end
 
 # cuda_tile.ftoi (scalar float to integer)
-@intrinsic function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer}
-    s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x)
+@intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer}
+function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
+    tgt = CC.widenconst(target_type)  # widen the lattice element to a plain Julia type
+    CC.isType(tgt) || return nothing  # only Type{T} has a T; bare DataType/Union would throw below
+    src, T = CC.widenconst(x), tgt.parameters[1]
+    src <: Tile ? similar_type(src, T) : T  # Tile inputs go through similar_type, scalars yield T
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args)
     cb = ctx.cb
@@ -65,8 +79,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args)
 end
 
 # cuda_tile.itof (scalar integer to float)
-@intrinsic function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat}
-    s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x)
+@intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat}
+function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
+    tgt = CC.widenconst(target_type)  # widen the lattice element to a plain Julia type
+    CC.isType(tgt) || return nothing  # only Type{T} has a T; bare DataType/Union would throw below
+    src, T = CC.widenconst(x), tgt.parameters[1]
+    src <: Tile ? similar_type(src, T) : T  # Tile inputs go through similar_type, scalars yield T
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args)
     cb = ctx.cb
@@ -86,7 +104,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args)
 end
 
 # cuda_tile.trunci (scalar integer truncation)
-@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x)
+@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer}
+function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type))
+    tgt = CC.widenconst(target_type)  # widen the lattice element to a plain Julia type
+    CC.isType(tgt) || return nothing  # only Type{T} has a T; bare DataType/Union would throw below
+    src, T = CC.widenconst(x), tgt.parameters[1]
+    src <: Tile ? similar_type(src, T) : T  # Tile inputs go through similar_type, scalars yield T
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index abe3f34..cfbba8a 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -20,11 +20,10 @@ end
 
 # cuda_tile.broadcast
 @intrinsic broadcast(tile, shape)
-function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.broadcast), @nospecialize(tile), @nospecialize(shape_arg))
+    tile_type = CC.widenconst(tile)
     tile_type <: Tile || return nothing
-    shape_arg = argtypes[3]
+    # `shape_arg` is kept as a lattice element: only a CC.Const carries the shape value
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tile_type)
@@ -100,11 +99,9 @@ end
 
 # cuda_tile.cat
 @intrinsic cat(tiles, axis)
-function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tuple_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.cat), @nospecialize(tiles), @nospecialize(axis_arg))
+    tuple_type = CC.widenconst(tiles)
     tuple_type <: Tuple{Tile, Tile} || return nothing
-    axis_arg = argtypes[3]
     isa(axis_arg, CC.Const) || return nothing
     axis = axis_arg.val
     t1_type = tuple_type.parameters[1]
@@ -167,12 +164,10 @@ end
 
 # cuda_tile.constant
 @intrinsic constant(shape, value, T)
-function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any})
-    length(argtypes) >= 4 || return nothing
-    shape_arg = argtypes[2]
+function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat))
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
-    type_arg = CC.widenconst(argtypes[4])
+    type_arg = CC.widenconst(type_arg_lat)
     type_arg <: Type || return nothing
     T = type_arg.parameters[1]
     return Tile{T, Tuple{shape...}}
@@ -207,11 +202,9 @@ end
 
 # cuda_tile.extract
 @intrinsic extract(tile, index, shape)
-function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any})
-    length(argtypes) >= 4 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.extract), @nospecialize(tile_lat), @nospecialize(index), @nospecialize(shape_arg))
+    tile_type = CC.widenconst(tile_lat)
     tile_type <: Tile || return nothing
-    shape_arg = argtypes[4]
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tile_type)
@@ -261,7 +254,7 @@ end
 
 # cuda_tile.get_num_tile_blocks
 @intrinsic get_num_tile_blocks(axis)
-tfunc(::typeof(Intrinsics.get_num_tile_blocks), argtypes::Vector{Any}) = Int32
+tfunc(𝕃, ::typeof(Intrinsics.get_num_tile_blocks), @nospecialize(axis)) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args)
     axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant"))
     axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis"))
@@ -274,7 +267,7 @@ end
 
 # cuda_tile.get_tile_block_id
 @intrinsic get_tile_block_id(axis)
-tfunc(::typeof(Intrinsics.get_tile_block_id), argtypes::Vector{Any}) = Int32
+tfunc(𝕃, ::typeof(Intrinsics.get_tile_block_id), @nospecialize(axis)) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args)
     axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant"))
     axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis"))
@@ -290,12 +283,10 @@ end
 
 # cuda_tile.iota
 @intrinsic iota(shape, T)
-function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    shape_arg = argtypes[2]
+function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat))
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
-    type_arg = CC.widenconst(argtypes[3])
+    type_arg = CC.widenconst(type_arg_lat)
     type_arg <: Type || return nothing
     T = type_arg.parameters[1]
     return Tile{T, Tuple{shape...}}
@@ -324,7 +315,7 @@ end
 
 # cuda_tile.mmaf, cuda_tile.mmai
 @intrinsic mma(a, b, acc)
-tfunc(::typeof(Intrinsics.mma), argtypes::Vector{Any}) = CC.widenconst(argtypes[4])
+tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args)
     cb = ctx.cb
 
@@ -343,11 +334,10 @@ end
 
 # cuda_tile.offset
 @intrinsic offset(base, offsets)
-function tfunc(::typeof(Intrinsics.offset), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    base_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.offset), @nospecialize(base), @nospecialize(offsets))
+    base_type = CC.widenconst(base)
     base_type <: Ptr || return nothing
-    offsets_type = CC.widenconst(argtypes[3])
+    offsets_type = CC.widenconst(offsets)
     offsets_type <: Tile || return nothing
     T = eltype(base_type)
     S = offsets_type.parameters[2]
@@ -397,11 +387,9 @@ end
 
 # cuda_tile.permute
 @intrinsic permute(tile, perm)
-function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.permute), @nospecialize(tile_lat), @nospecialize(perm_arg))
+    tile_type = CC.widenconst(tile_lat)
     tile_type <: Tile || return nothing
-    perm_arg = argtypes[3]
     isa(perm_arg, CC.Const) || return nothing
     perm = perm_arg.val
     s = size(tile_type)
@@ -447,9 +435,8 @@ end
 
 # cuda_tile.transpose
 @intrinsic transpose(tile)
-function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.transpose), @nospecialize(tile_lat))
+    tile_type = CC.widenconst(tile_lat)
     tile_type <: Tile || return nothing
     s = size(tile_type)
     isempty(s) && return nothing
@@ -484,11 +471,9 @@ end
 
 # cuda_tile.reduce
 @intrinsic reduce(tiles, axis, f, identities)
-function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tuple_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.reduce), @nospecialize(tiles), @nospecialize(axis_arg), @nospecialize args...)
+    tuple_type = CC.widenconst(tiles)
     tuple_type isa DataType && tuple_type <: Tuple || return nothing
-    axis_arg = argtypes[3]
     isa(axis_arg, CC.Const) || return nothing
     axis = axis_arg.val
     result_params = Any[]
@@ -615,11 +600,9 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer =
 
 # cuda_tile.reshape
 @intrinsic reshape(tile, shape)
-function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.reshape), @nospecialize(tile_lat), @nospecialize(shape_arg))
+    tile_type = CC.widenconst(tile_lat)
     tile_type <: Tile || return nothing
-    shape_arg = argtypes[3]
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tile_type)
@@ -684,9 +667,8 @@ end
 
 # cuda_tile.scan
 @intrinsic scan(tiles, axis, f, identities, reverse=false)
-function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    tuple_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.scan), @nospecialize(tiles), @nospecialize args...)
+    tuple_type = CC.widenconst(tiles)
     tuple_type isa DataType && tuple_type <: Tuple || return nothing
     result_params = Any[]
     for p in tuple_type.parameters
@@ -784,10 +766,8 @@ end
 # cuda_tile.select
 @intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y)
 @intrinsic select(cond::Tile, x, y)
-function tfunc(::typeof(Intrinsics.select), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    cond_type = CC.widenconst(argtypes[2])
-    cond_type <: Tile ? CC.widenconst(argtypes[3]) : nothing
+function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y))
+    CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args)
     cb = ctx.cb
@@ -810,17 +790,15 @@ end
 # from_scalar: restores jltype to Tile{T, S}.
 @intrinsic to_scalar(tile)
 @intrinsic from_scalar(x, S)
-function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    T = CC.widenconst(argtypes[2])
-    shape_type = CC.widenconst(argtypes[3])
+function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat))
+    T = CC.widenconst(x)
+    shape_type = CC.widenconst(S_lat)
     shape_type <: Type || return nothing
     S = shape_type.parameters[1]
     return Tile{T, S}
 end
-function tfunc(::typeof(Intrinsics.to_scalar), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    tile_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat))
+    tile_type = CC.widenconst(tile_lat)
     tile_type <: Tile || return nothing
     return eltype(tile_type)
 end
diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl
index 8bd3e93..1a35010 100644
--- a/src/compiler/intrinsics/math.jl
+++ b/src/compiler/intrinsics/math.jl
@@ -4,28 +4,28 @@
 
 # cuda_tile.ceil
 @intrinsic ceil(x)
-tfunc(::typeof(Intrinsics.ceil), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args)
     emit_unop!(ctx, args, encode_CeilOp!)
 end
 
 # cuda_tile.cos
 @intrinsic cos(x)
-tfunc(::typeof(Intrinsics.cos), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args)
     emit_unop!(ctx, args, encode_CosOp!)
 end
 
 # cuda_tile.cosh
 @intrinsic cosh(x)
-tfunc(::typeof(Intrinsics.cosh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args)
     emit_unop!(ctx, args, encode_CosHOp!)
 end
 
 # cuda_tile.exp2
 @intrinsic exp2(x, flush_to_zero=false)
-tfunc(::typeof(Intrinsics.exp2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args)
     cb = ctx.cb
 
@@ -41,7 +41,7 @@ end
 
 # cuda_tile.exp
 @intrinsic exp(x)
-tfunc(::typeof(Intrinsics.exp), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args)
     cb = ctx.cb
 
@@ -55,14 +55,14 @@ end
 
 # cuda_tile.floor
 @intrinsic floor(x)
-tfunc(::typeof(Intrinsics.floor), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args)
     emit_unop!(ctx, args, encode_FloorOp!)
 end
 
 # cuda_tile.fma
 @intrinsic fma(x, y, z)
-tfunc(::typeof(Intrinsics.fma), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
     cb = ctx.cb
 
@@ -79,7 +79,7 @@ end
 
 # cuda_tile.log2
 @intrinsic log2(x)
-tfunc(::typeof(Intrinsics.log2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args)
     cb = ctx.cb
 
@@ -93,7 +93,7 @@ end
 
 # cuda_tile.log
 @intrinsic log(x)
-tfunc(::typeof(Intrinsics.log), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args)
     cb = ctx.cb
 
@@ -107,35 +107,35 @@ end
 
 # cuda_tile.maxf
 @intrinsic maxf(x, y)
-tfunc(::typeof(Intrinsics.maxf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args)
     emit_binop!(ctx, args, encode_MaxFOp!)
 end
 
 # cuda_tile.minf
 @intrinsic minf(x, y)
-tfunc(::typeof(Intrinsics.minf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args)
     emit_binop!(ctx, args, encode_MinFOp!)
 end
 
 # cuda_tile.pow
 @intrinsic pow(x, y)
-tfunc(::typeof(Intrinsics.pow), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args)
     emit_binop!(ctx, args, encode_PowOp!)
 end
 
 # cuda_tile.remf
 @intrinsic remf(x, y)
-tfunc(::typeof(Intrinsics.remf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args)
     emit_binop!(ctx, args, encode_RemFOp!)
 end
 
 # cuda_tile.rsqrt
 @intrinsic rsqrt(x, flush_to_zero=false)
-tfunc(::typeof(Intrinsics.rsqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
     cb = ctx.cb
 
@@ -151,21 +151,21 @@ end
 
 # cuda_tile.sin
 @intrinsic sin(x)
-tfunc(::typeof(Intrinsics.sin), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args)
     emit_unop!(ctx, args, encode_SinOp!)
 end
 
 # cuda_tile.sinh
 @intrinsic sinh(x)
-tfunc(::typeof(Intrinsics.sinh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args)
     emit_unop!(ctx, args, encode_SinHOp!)
 end
 
 # cuda_tile.sqrt
 @intrinsic sqrt(x)
-tfunc(::typeof(Intrinsics.sqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
     cb = ctx.cb
 
@@ -179,14 +179,14 @@ end
 
 # cuda_tile.tan
 @intrinsic tan(x)
-tfunc(::typeof(Intrinsics.tan), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args)
     emit_unop!(ctx, args, encode_TanOp!)
 end
 
 # cuda_tile.tanh
 @intrinsic tanh(x)
-tfunc(::typeof(Intrinsics.tanh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2])
+tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args)
     emit_unop!(ctx, args, encode_TanHOp!)
 end
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index f7bf9e5..d4d4f87 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -4,9 +4,8 @@
 
 # cuda_tile.load_ptr_tko
 @intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)
-function tfunc(::typeof(Intrinsics.load_ptr_tko), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    ptrs_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.load_ptr_tko), @nospecialize(ptrs), @nospecialize args...)
+    ptrs_type = CC.widenconst(ptrs)
     ptrs_type <: Tile || return nothing
     ptr_type = eltype(ptrs_type)
     ptr_type <: Ptr || return nothing
@@ -74,11 +73,10 @@ end
 # TODO: cuda_tile.make_token
 
 # cuda_tile.store_ptr_tko
-@intrinsic function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
-                                            latency::Union{Int, Nothing},
-                                            mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-    nothing
-end
+@intrinsic store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
+                                   latency::Union{Int, Nothing},
+                                   mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
+tfunc(𝕃, ::typeof(Intrinsics.store_ptr_tko), @nospecialize args...) = Nothing
 efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl
index 2a0a784..fa1c4ba 100644
--- a/src/compiler/intrinsics/misc.jl
+++ b/src/compiler/intrinsics/misc.jl
@@ -1,9 +1,8 @@
 # miscellaneous intrinsics
 
 # cuda_tile.assert
-@intrinsic function assert(cond::Bool, message::String)
-    nothing
-end
+@intrinsic assert(cond::Bool, message::String)
+tfunc(𝕃, ::typeof(Intrinsics.assert), @nospecialize(cond), @nospecialize(message)) = Nothing
 efunc(::typeof(Intrinsics.assert), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args)
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index fd1bdde..fff19b1 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -25,7 +25,7 @@ end
 
 # cuda_tile.get_index_space_shape
 @intrinsic get_index_space_shape(pv, axis)
-tfunc(::typeof(Intrinsics.get_index_space_shape), argtypes::Vector{Any}) = Int32
+tfunc(𝕃, ::typeof(Intrinsics.get_index_space_shape), @nospecialize(pv), @nospecialize(axis)) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -61,9 +61,8 @@ end
 
 # cuda_tile.load_view_tko
 @intrinsic load_partition_view(pv, latency, allow_tma, indices)
-function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    pv_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.load_partition_view), @nospecialize(pv), @nospecialize args...)
+    pv_type = CC.widenconst(pv)
     pv_type <: PartitionView || return nothing
     pv_type isa DataType || return nothing
     length(pv_type.parameters) >= 3 || return nothing
@@ -151,11 +150,9 @@ end
 
 # cuda_tile.make_partition_view
 @intrinsic make_partition_view(tv, shape, padding_mode, order)
-function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tv_type = CC.widenconst(argtypes[2])
+function tfunc(𝕃, ::typeof(Intrinsics.make_partition_view), @nospecialize(tv), @nospecialize(shape_arg), @nospecialize args...)
+    tv_type = CC.widenconst(tv)
     tv_type <: TensorView || return nothing
-    shape_arg = argtypes[3]
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tv_type)
@@ -302,8 +299,11 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I
 end
 
 # cuda_tile.make_tensor_view
-@intrinsic function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
-    TensorView{T, N}()
+@intrinsic make_tensor_view(arr::TileArray{T, N}) where {T, N}
+function tfunc(𝕃, ::typeof(Intrinsics.make_tensor_view), @nospecialize(arr))
+    t = CC.widenconst(arr)  # widen the lattice element to a plain Julia type
+    t <: TileArray || return nothing  # no custom result type unless the input is a TileArray
+    TensorView{eltype(t), ndims(t)}  # same type the removed method body declared: TensorView{T, N}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args)
     array_arg = args[1]
@@ -324,13 +324,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args
 end
 
 # cuda_tile.store_view_tko
-@intrinsic function store_partition_view(pv::PartitionView{T, N, Shape},
-                                                   tile::Tile{T},
-                                                   latency::Union{Int, Nothing},
-                                                   allow_tma::Bool,
-                                                   indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-    nothing
-end
+@intrinsic store_partition_view(pv::PartitionView{T, N, Shape},
+                                          tile::Tile{T},
+                                          latency::Union{Int, Nothing},
+                                          allow_tma::Bool,
+                                          indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
+tfunc(𝕃, ::typeof(Intrinsics.store_partition_view), @nospecialize args...) = Nothing
 efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) =
     CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args)

From 6979d0bb221c435001993a8647cdcc36616cd1d0 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 10 Feb 2026 12:15:22 +0100
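Subject: [PATCH 06/17] Restrict intrinsics.

Replace untyped catch-all signatures of intrinsics such as cldi, cmpi, divi
and fldi with explicit scalar and Tile{T} methods constrained to T<:Integer,
taking typed Signedness / ComparisonPredicate arguments.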
---
 src/compiler/intrinsics/arithmetic.jl | 48 +++++++++++-----------
 src/compiler/intrinsics/core.jl       |  4 +-
 src/compiler/intrinsics/math.jl       | 57 ++++++++++++++++++---------
 3 files changed, 65 insertions(+), 44 deletions(-)

diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 05a0a2c..79c33c9 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -100,7 +100,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args)
 end
 
 # cuda_tile.cldi (ceiling division, toward positive infinity)
-@intrinsic cldi(x, y, s)
+@intrinsic cldi(x::T, y::T, s::Signedness) where {T<:Integer}
+@intrinsic cldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness"))
@@ -123,7 +124,7 @@ end
         Core.Intrinsics.ne_int(x, y)
     end
 end
-@intrinsic cmpi(a::Tile, b::Tile, pred, s)
+@intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s))
     t = CC.widenconst(x)
     if t <: Tile
@@ -159,7 +160,7 @@ end
 @intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer}
     s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y)
 end
-@intrinsic divi(a::Tile, b::Tile, s)
+@intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness"))
@@ -167,7 +168,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
 end
 
 # cuda_tile.fldi (floor division, toward negative infinity)
-@intrinsic fldi(x, y, s)
+@intrinsic fldi(x::T, y::T, s::Signedness) where {T<:Integer}
+@intrinsic fldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness"))
@@ -179,7 +181,7 @@ end
     lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
     ifelse(lt, y, x)
 end
-@intrinsic maxi(a::Tile, b::Tile, s)
+@intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness"))
@@ -188,7 +190,7 @@ end
 
 # cuda_tile.mini
 @intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer}
-@intrinsic mini(a::Tile, b::Tile, s)
+@intrinsic mini(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness"))
@@ -197,7 +199,7 @@ end
 
 # cuda_tile.muli
 @intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
-@intrinsic muli(a::Tile, b::Tile)
+@intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args)
     emit_binop!(ctx, args, encode_MulIOp!)
@@ -205,7 +207,7 @@ end
 
 # cuda_tile.mulhii
 @intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
-@intrinsic mulhii(a::Tile, b::Tile, s)
+@intrinsic mulhii(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args)
     emit_binop!(ctx, args, encode_MulhiIOp!)
@@ -213,7 +215,7 @@ end
 
 # cuda_tile.negi
 @intrinsic negi(x::T) where {T<:Integer}
-@intrinsic negi(a::Tile)
+@intrinsic negi(a::Tile{<:Integer})
 tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args)
     emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone)
@@ -221,7 +223,7 @@ end
 
 # cuda_tile.remi
 @intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer}
-@intrinsic remi(a::Tile, b::Tile, s)
+@intrinsic remi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness"))
@@ -230,7 +232,7 @@ end
 
 # cuda_tile.shli
 @intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T)
-@intrinsic shli(a::Tile, b::Tile)
+@intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args)
     emit_binop!(ctx, args, encode_ShLIOp!)
@@ -238,7 +240,7 @@ end
 
 # cuda_tile.shri
 @intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer}
-@intrinsic shri(a::Tile, b::Tile, s)
+@intrinsic shri(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness"))
@@ -247,7 +249,7 @@ end
 
 # cuda_tile.subi
 @intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
-@intrinsic subi(a::Tile, b::Tile)
+@intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args)
     emit_binop!(ctx, args, encode_SubIOp!)
@@ -258,7 +260,7 @@ end
 
 # cuda_tile.absf
 @intrinsic absf(x::T) where {T<:AbstractFloat}
-@intrinsic absf(a::Tile)
+@intrinsic absf(a::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args)
     emit_unop!(ctx, args, encode_AbsFOp!)
@@ -266,7 +268,7 @@ end
 
 # cuda_tile.addf
 @intrinsic addf(x::T, y::T) where {T<:AbstractFloat}
-@intrinsic addf(a::Tile, b::Tile)
+@intrinsic addf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args)
     emit_binop!(ctx, args, encode_AddFOp!)
@@ -274,7 +276,7 @@ end
 
 # cuda_tile.cmpf
 @intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat}
-@intrinsic cmpf(a::Tile, b::Tile, pred)
+@intrinsic cmpf(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate) where {T<:AbstractFloat}
 function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred))
     t = CC.widenconst(x)
     if t <: Tile
@@ -307,7 +309,7 @@ end
 
 # cuda_tile.divf
 @intrinsic divf(x::T, y::T) where {T<:AbstractFloat}
-@intrinsic divf(a::Tile, b::Tile)
+@intrinsic divf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args)
     emit_binop!(ctx, args, encode_DivFOp!)
@@ -315,7 +317,7 @@ end
 
 # cuda_tile.mulf
 @intrinsic mulf(x::T, y::T) where {T<:AbstractFloat}
-@intrinsic mulf(a::Tile, b::Tile)
+@intrinsic mulf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args)
     emit_binop!(ctx, args, encode_MulFOp!)
@@ -323,7 +325,7 @@ end
 
 # cuda_tile.negf
 @intrinsic negf(x::T) where {T<:AbstractFloat}
-@intrinsic negf(a::Tile)
+@intrinsic negf(a::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args)
     emit_unop!(ctx, args, encode_NegFOp!)
@@ -331,7 +333,7 @@ end
 
 # cuda_tile.subf
 @intrinsic subf(x::T, y::T) where {T<:AbstractFloat}
-@intrinsic subf(a::Tile, b::Tile)
+@intrinsic subf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args)
     emit_binop!(ctx, args, encode_SubFOp!)
@@ -342,7 +344,7 @@ end
 
 # cuda_tile.andi
 @intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
-@intrinsic andi(a::Tile, b::Tile)
+@intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
     cb = ctx.cb
@@ -361,7 +363,7 @@ end
 
 # cuda_tile.ori
 @intrinsic ori(x::T, y::T) where {T<:Integer}
-@intrinsic ori(a::Tile, b::Tile)
+@intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
     cb = ctx.cb
@@ -380,7 +382,7 @@ end
 
 # cuda_tile.xori
 @intrinsic xori(x::T, y::T) where {T<:Integer}
-@intrinsic xori(a::Tile, b::Tile)
+@intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
     cb = ctx.cb
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index cfbba8a..53b949c 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -314,7 +314,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args)
 end
 
 # cuda_tile.mmaf, cuda_tile.mmai
-@intrinsic mma(a, b, acc)
+@intrinsic mma(a::Tile, b::Tile, acc::Tile)
 tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args)
     cb = ctx.cb
@@ -765,7 +765,7 @@ end
 
 # cuda_tile.select
 @intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y)
-@intrinsic select(cond::Tile, x, y)
+@intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T}
 function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y))
     CC.widenconst(x)
 end
diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl
index 1a35010..519cf1e 100644
--- a/src/compiler/intrinsics/math.jl
+++ b/src/compiler/intrinsics/math.jl
@@ -3,28 +3,32 @@
 ## Floating-point math
 
 # cuda_tile.ceil
-@intrinsic ceil(x)
+@intrinsic ceil(x::AbstractFloat)
+@intrinsic ceil(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args)
     emit_unop!(ctx, args, encode_CeilOp!)
 end
 
 # cuda_tile.cos
-@intrinsic cos(x)
+@intrinsic cos(x::AbstractFloat)
+@intrinsic cos(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args)
     emit_unop!(ctx, args, encode_CosOp!)
 end
 
 # cuda_tile.cosh
-@intrinsic cosh(x)
+@intrinsic cosh(x::AbstractFloat)
+@intrinsic cosh(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args)
     emit_unop!(ctx, args, encode_CosHOp!)
 end
 
 # cuda_tile.exp2
-@intrinsic exp2(x, flush_to_zero=false)
+@intrinsic exp2(x::AbstractFloat, flush_to_zero::Bool=false)
+@intrinsic exp2(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false)
 tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args)
     cb = ctx.cb
@@ -40,7 +44,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args)
 end
 
 # cuda_tile.exp
-@intrinsic exp(x)
+@intrinsic exp(x::AbstractFloat)
+@intrinsic exp(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args)
     cb = ctx.cb
@@ -54,14 +59,16 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args)
 end
 
 # cuda_tile.floor
-@intrinsic floor(x)
+@intrinsic floor(x::AbstractFloat)
+@intrinsic floor(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args)
     emit_unop!(ctx, args, encode_FloorOp!)
 end
 
 # cuda_tile.fma
-@intrinsic fma(x, y, z)
+@intrinsic fma(x::T, y::T, z::T) where {T<:AbstractFloat}
+@intrinsic fma(x::Tile{T}, y::Tile{T}, z::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
     cb = ctx.cb
@@ -78,7 +85,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
 end
 
 # cuda_tile.log2
-@intrinsic log2(x)
+@intrinsic log2(x::AbstractFloat)
+@intrinsic log2(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args)
     cb = ctx.cb
@@ -92,7 +100,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args)
 end
 
 # cuda_tile.log
-@intrinsic log(x)
+@intrinsic log(x::AbstractFloat)
+@intrinsic log(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args)
     cb = ctx.cb
@@ -106,35 +115,40 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args)
 end
 
 # cuda_tile.maxf
-@intrinsic maxf(x, y)
+@intrinsic maxf(x::T, y::T) where {T<:AbstractFloat}
+@intrinsic maxf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args)
     emit_binop!(ctx, args, encode_MaxFOp!)
 end
 
 # cuda_tile.minf
-@intrinsic minf(x, y)
+@intrinsic minf(x::T, y::T) where {T<:AbstractFloat}
+@intrinsic minf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args)
     emit_binop!(ctx, args, encode_MinFOp!)
 end
 
 # cuda_tile.pow
-@intrinsic pow(x, y)
+@intrinsic pow(x::T, y::T) where {T<:AbstractFloat}
+@intrinsic pow(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args)
     emit_binop!(ctx, args, encode_PowOp!)
 end
 
 # cuda_tile.remf
-@intrinsic remf(x, y)
+@intrinsic remf(x::T, y::T) where {T<:AbstractFloat}
+@intrinsic remf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat}
 tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args)
     emit_binop!(ctx, args, encode_RemFOp!)
 end
 
 # cuda_tile.rsqrt
-@intrinsic rsqrt(x, flush_to_zero=false)
+@intrinsic rsqrt(x::AbstractFloat, flush_to_zero::Bool=false)
+@intrinsic rsqrt(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false)
 tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
     cb = ctx.cb
@@ -150,21 +164,24 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
 end
 
 # cuda_tile.sin
-@intrinsic sin(x)
+@intrinsic sin(x::AbstractFloat)
+@intrinsic sin(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args)
     emit_unop!(ctx, args, encode_SinOp!)
 end
 
 # cuda_tile.sinh
-@intrinsic sinh(x)
+@intrinsic sinh(x::AbstractFloat)
+@intrinsic sinh(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args)
     emit_unop!(ctx, args, encode_SinHOp!)
 end
 
 # cuda_tile.sqrt
-@intrinsic sqrt(x)
+@intrinsic sqrt(x::AbstractFloat)
+@intrinsic sqrt(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
     cb = ctx.cb
@@ -178,14 +195,16 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
 end
 
 # cuda_tile.tan
-@intrinsic tan(x)
+@intrinsic tan(x::AbstractFloat)
+@intrinsic tan(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args)
     emit_unop!(ctx, args, encode_TanOp!)
 end
 
 # cuda_tile.tanh
-@intrinsic tanh(x)
+@intrinsic tanh(x::AbstractFloat)
+@intrinsic tanh(x::Tile{<:AbstractFloat})
 tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args)
     emit_unop!(ctx, args, encode_TanHOp!)

From ac32212fa4a9fbdf2e1ce713972b0d0bd8ff5463 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 10 Feb 2026 21:40:22 +0100
Subject: [PATCH 07/17] Get ifelse working by avoiding the error.

---
 src/compiler/intrinsics.jl      | 3 ---
 src/compiler/intrinsics/core.jl | 5 ++++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index df0fc48..37c4b4b 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -41,9 +41,6 @@ macro intrinsic(ex)
         combinedef(splitdef(ex))
     else
         body = quote
-            if inferencebarrier(true)::Bool
-                error("Intrinsic $(string(ex)) cannot be evaluated at compile time")
-            end
             compilerbarrier(:type, nothing)
         end
         Expr(:function, ex, body)
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 53b949c..346ea0d 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -764,9 +764,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)
 end
 
 # cuda_tile.select
-@intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y)
+@intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y)
 @intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T}
 function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y))
+    if cond isa CC.Const
+        return cond.val === true ? x : y
+    end
     CC.widenconst(x)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args)

From 08b208fbebe4354a106f03e913db7bba54c20338 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 10 Feb 2026 21:59:59 +0100
Subject: [PATCH 08/17] Remove more intrinsic bodies.

---
 src/compiler/intrinsics.jl             | 10 +++-----
 src/compiler/intrinsics/arithmetic.jl  | 33 ++++++--------------------
 src/compiler/intrinsics/conversions.jl |  4 +---
 3 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 37c4b4b..ab6d02c 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -37,14 +37,10 @@ provide a correct scalar implementation using `Core.Intrinsics`, or return
 `nothing` for side-effect-only intrinsics.
 """
 macro intrinsic(ex)
-    funcdef = if ex isa Expr && ex.head in (:function, :(=))
-        combinedef(splitdef(ex))
-    else
-        body = quote
-            compilerbarrier(:type, nothing)
-        end
-        Expr(:function, ex, body)
+    body = quote
+        compilerbarrier(:type, nothing)
     end
+    funcdef = Expr(:function, ex, body)
     funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef)
     return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef)))))
 end
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 79c33c9..9af46d0 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -109,21 +109,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args)
 end
 
 # cuda_tile.cmpi
-@intrinsic function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
-    if pred === CmpLessThan
-        s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-    elseif pred === CmpLessThanOrEqual
-        s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y)
-    elseif pred === CmpGreaterThan
-        s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x)
-    elseif pred === CmpGreaterThanOrEqual
-        s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x)
-    elseif pred === CmpEqual
-        Core.Intrinsics.eq_int(x, y)
-    else  # CmpNotEqual
-        Core.Intrinsics.ne_int(x, y)
-    end
-end
+@intrinsic cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
 @intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s))
     t = CC.widenconst(x)
@@ -157,9 +143,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args)
 end
 
 # cuda_tile.divi (truncating division, toward zero)
-@intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer}
-    s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y)
-end
+@intrinsic divi(x::T, y::T, s::Signedness) where {T<:Integer}
 @intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args)
@@ -177,10 +161,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args)
 end
 
 # cuda_tile.maxi
-@intrinsic function maxi(x::T, y::T, s::Signedness) where {T<:Integer}
-    lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y)
-    ifelse(lt, y, x)
-end
+@intrinsic maxi(x::T, y::T, s::Signedness) where {T<:Integer}
 @intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
@@ -198,7 +179,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
 end
 
 # cuda_tile.muli
-@intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
+@intrinsic muli(x::T, y::T) where {T<:Integer}
 @intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args)
@@ -231,7 +212,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args)
 end
 
 # cuda_tile.shli
-@intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T)
+@intrinsic shli(x::T, y::Integer) where {T<:Integer}
 @intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args)
@@ -248,7 +229,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args)
 end
 
 # cuda_tile.subi
-@intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
+@intrinsic subi(x::T, y::T) where {T<:Integer}
 @intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args)
@@ -343,7 +324,7 @@ end
 ## Boolean arithmetic
 
 # cuda_tile.andi
-@intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
+@intrinsic andi(x::T, y::T) where {T<:Integer}
 @intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
 tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl
index 638b05d..409bff1 100644
--- a/src/compiler/intrinsics/conversions.jl
+++ b/src/compiler/intrinsics/conversions.jl
@@ -3,9 +3,7 @@
 # TODO: cuda_tile.bitcast
 
 # cuda_tile.exti (scalar integer extension)
-@intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
-    s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x)
-end
+@intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
     tgt = CC.widenconst(target_type)
     T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing

From ca66ae76a95cdd8101586588efa7483308b50326 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 09:09:22 +0100
Subject: [PATCH 09/17] Remove ExprTools.

---
 Project.toml               | 12 +++++-------
 src/compiler/intrinsics.jl |  2 --
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/Project.toml b/Project.toml
index cdff353..5323854 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,10 +8,9 @@ projects = ["test", "examples"]
 
 [deps]
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
-CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8"
 CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d"
-ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd"
 IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93"
 
 [weakdeps]
@@ -19,18 +18,17 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c"
 
 [sources]
-CompilerCaching = {url = "https://github.com/maleadt/CompilerCaching.jl", rev="main"}
-IRStructurizer = {url = "https://github.com/maleadt/IRStructurizer.jl", rev = "main"}
+CompilerCaching = {rev = "main", url = "https://github.com/maleadt/CompilerCaching.jl"}
+IRStructurizer = {rev = "main", url = "https://github.com/maleadt/IRStructurizer.jl"}
 
 [extensions]
 CUDAExt = "CUDA"
 DLFP8TypesExt = "DLFP8Types"
 
 [compat]
-julia = "1.11"
 BFloat16s = "0.6"
-CompilerCaching = "0.1"
 CUDA_Compiler_jll = "0.4"
 CUDA_Tile_jll = "13.1"
-ExprTools = "0.1"
+CompilerCaching = "0.1"
 IRStructurizer = "0.1"
+julia = "1.11"
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index ab6d02c..56cce9b 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -17,8 +17,6 @@ end
 #       Const-prop for overlay callers happens via @assume_effects :foldable at the
 #       overlay level, not through intrinsic bodies.
 
-using ExprTools: splitdef, combinedef
-
 """
     @intrinsic signature
     @intrinsic function_definition

From 5a686d898542d59c501dea58c390bd71d1832134 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 09:46:45 +0100
Subject: [PATCH 10/17] Port tfunc improvements from Julia.

---
 src/compiler/intrinsics.jl             | 18 ++++++++++++++++-
 src/compiler/intrinsics/arithmetic.jl  | 18 +++++++++++++++--
 src/compiler/intrinsics/conversions.jl | 20 +++++++++----------
 src/compiler/intrinsics/core.jl        | 27 ++++++++++++++------------
 4 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 56cce9b..c1d27aa 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -43,7 +43,21 @@ macro intrinsic(ex)
     return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef)))))
 end
 
-emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
+"""
+    instanceof_tfunc(lat) -> Type or nothing
+
+Extract `T` from a lattice element representing `Type{T}`.
+Simplified version of `Base.Compiler.instanceof_tfunc` that handles `Const(T)`
+and `Type{T}` lattice elements. Returns `nothing` when `T` cannot be determined.
+"""
+function instanceof_tfunc(@nospecialize(lat))
+    if isa(lat, CC.Const)
+        val = lat.val
+        return val isa Type ? val : nothing
+    end
+    tgt = CC.widenconst(lat)
+    return tgt isa DataType && tgt <: Type && !isempty(tgt.parameters) ? tgt.parameters[1] : nothing
+end
 
 # Shared helper for creating load/store optimization hints
 function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true)
@@ -53,6 +67,8 @@ function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, all
     return make_load_store_hints(ctx.sm_arch, hints)
 end
 
+emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
+
 include("intrinsics/core.jl")
 include("intrinsics/conversions.jl")
 include("intrinsics/arithmetic.jl")
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 9af46d0..3aba1c6 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -326,7 +326,14 @@ end
 # cuda_tile.andi
 @intrinsic andi(x::T, y::T) where {T<:Integer}
 @intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer}
-tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
+function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y))
+    if isa(x, CC.Const) && x.val === false && CC.widenconst(y) === Bool
+        return CC.Const(false)
+    elseif isa(y, CC.Const) && y.val === false && CC.widenconst(x) === Bool
+        return CC.Const(false)
+    end
+    return CC.widenconst(x)
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -345,7 +352,14 @@ end
 # cuda_tile.ori
 @intrinsic ori(x::T, y::T) where {T<:Integer}
 @intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer}
-tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x)
+function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y))
+    if isa(x, CC.Const) && x.val === true && CC.widenconst(y) === Bool
+        return CC.Const(true)
+    elseif isa(y, CC.Const) && y.val === true && CC.widenconst(x) === Bool
+        return CC.Const(true)
+    end
+    return CC.widenconst(x)
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl
index 409bff1..e302063 100644
--- a/src/compiler/intrinsics/conversions.jl
+++ b/src/compiler/intrinsics/conversions.jl
@@ -5,8 +5,8 @@
 # cuda_tile.exti (scalar integer extension)
 @intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
-    tgt = CC.widenconst(target_type)
-    T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing
+    T = instanceof_tfunc(target_type)
+    T === nothing && return nothing
     src = CC.widenconst(x)
     src <: Tile ? similar_type(src, T) : T
 end
@@ -30,8 +30,8 @@ end
 # cuda_tile.ftof (scalar float to float)
 @intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat}
 function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type))
-    tgt = CC.widenconst(target_type)
-    T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing
+    T = instanceof_tfunc(target_type)
+    T === nothing && return nothing
     src = CC.widenconst(x)
     src <: Tile ? similar_type(src, T) : T
 end
@@ -54,8 +54,8 @@ end
 # cuda_tile.ftoi (scalar float to integer)
 @intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
-    tgt = CC.widenconst(target_type)
-    T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing
+    T = instanceof_tfunc(target_type)
+    T === nothing && return nothing
     src = CC.widenconst(x)
     src <: Tile ? similar_type(src, T) : T
 end
@@ -79,8 +79,8 @@ end
 # cuda_tile.itof (scalar integer to float)
 @intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat}
 function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s))
-    tgt = CC.widenconst(target_type)
-    T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing
+    T = instanceof_tfunc(target_type)
+    T === nothing && return nothing
     src = CC.widenconst(x)
     src <: Tile ? similar_type(src, T) : T
 end
@@ -104,8 +104,8 @@ end
 # cuda_tile.trunci (scalar integer truncation)
 @intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer}
 function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type))
-    tgt = CC.widenconst(target_type)
-    T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing
+    T = instanceof_tfunc(target_type)
+    T === nothing && return nothing
     src = CC.widenconst(x)
     src <: Tile ? similar_type(src, T) : T
 end
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 346ea0d..306d13a 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -167,9 +167,8 @@ end
 function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat))
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
-    type_arg = CC.widenconst(type_arg_lat)
-    type_arg <: Type || return nothing
-    T = type_arg.parameters[1]
+    T = instanceof_tfunc(type_arg_lat)
+    T === nothing && return nothing
     return Tile{T, Tuple{shape...}}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args)
@@ -286,9 +285,8 @@ end
 function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat))
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
-    type_arg = CC.widenconst(type_arg_lat)
-    type_arg <: Type || return nothing
-    T = type_arg.parameters[1]
+    T = instanceof_tfunc(type_arg_lat)
+    T === nothing && return nothing
     return Tile{T, Tuple{shape...}}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args)
@@ -767,10 +765,16 @@ end
 @intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y)
 @intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T}
 function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y))
-    if cond isa CC.Const
-        return cond.val === true ? x : y
+    if isa(cond, CC.Const)
+        if cond.val === true
+            return x
+        elseif cond.val === false
+            return y
+        else
+            return Union{}
+        end
     end
-    CC.widenconst(x)
+    return CC.tmerge(𝕃, x, y)
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args)
     cb = ctx.cb
@@ -795,9 +799,8 @@ end
 @intrinsic from_scalar(x, S)
 function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat))
     T = CC.widenconst(x)
-    shape_type = CC.widenconst(S_lat)
-    shape_type <: Type || return nothing
-    S = shape_type.parameters[1]
+    S = instanceof_tfunc(S_lat)
+    S === nothing && return nothing
     return Tile{T, S}
 end
 function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat))

From 335dd1404761d215c59c657a46f7ef019fe59dbb Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 09:59:18 +0100
Subject: [PATCH 11/17] Fix docstring.

---
 src/compiler/intrinsics.jl | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index c1d27aa..5275036 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -12,27 +12,12 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal
 
 end
 
-# NOTE: Intrinsics use bare signatures with dummy bodies (compilerbarrier(:type, nothing)).
-#       Return types are provided by tfunc overrides in the interpreter.
-#       Const-prop for overlay callers happens via @assume_effects :foldable at the
-#       overlay level, not through intrinsic bodies.
-
 """
     @intrinsic signature
-    @intrinsic function_definition
-
-Define a Tile IR intrinsic in the `Intrinsics` module.
-
-A bare signature (e.g. `@intrinsic foo(x)`) creates a dummy body using
-`compilerbarrier(:type, nothing)` so body inference returns `Any`. Actual
-return types come from `tfunc` overrides in the interpreter.
 
-A function definition (e.g. `@intrinsic foo(x) = expr`) preserves the body,
-providing a callable implementation for concrete evaluation. This is needed
-when overlay callers with `@assume_effects :foldable` cause the compiler to
-evaluate through intrinsic bodies (JuliaLang/julia#60583). The body should
-provide a correct scalar implementation using `Core.Intrinsics`, or return
-`nothing` for side-effect-only intrinsics.
+Define a Tile IR intrinsic in the `Intrinsics` module. These intrinsics are
+defined to return `Any`, so they need additional `tfunc` and `efunc` definitions
+to specify their behavior.
 """
 macro intrinsic(ex)
     body = quote

From 6a93f0c8b61dff60bc223c0edea4cfe29730305a Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 10:17:28 +0100
Subject: [PATCH 12/17] Simplify: stop forcing nonoverlayed=ALWAYS_FALSE on intrinsic effects.

---
 src/compiler/interface.jl | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index f69fc36..6935a75 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -192,11 +192,6 @@ end
             rt = rt_override !== nothing ? rt_override : cm.rt
             efunc_override = is_intr ? efunc(f, cm.effects) : nothing
             effects = efunc_override !== nothing ? efunc_override : cm.effects
-            # Mark intrinsics as non-consistently-overlayed so callers can't be
-            # concrete-eval'd (not_callable() bodies would throw at runtime).
-            if is_intr
-                effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
-            end
             info = is_intr ? CC.NoCallInfo() : cm.info
             info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
             wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
@@ -225,11 +220,6 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
             rt = rt_override !== nothing ? rt_override : cm.rt
             efunc_override = is_intr ? efunc(f, cm.effects) : nothing
             effects = efunc_override !== nothing ? efunc_override : cm.effects
-            # Mark intrinsics as non-consistently-overlayed so callers can't be
-            # concrete-eval'd (not_callable() bodies would throw at runtime).
-            if is_intr
-                effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
-            end
             info = is_intr ? CC.NoCallInfo() : cm.info
             info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info
             wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements)
@@ -251,11 +241,6 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
         rt = rt_override !== nothing ? rt_override : result.rt
         efunc_override = is_intr ? efunc(f, result.effects) : nothing
         effects = efunc_override !== nothing ? efunc_override : result.effects
-        # Mark intrinsics as non-consistently-overlayed so callers can't be
-        # concrete-eval'd (not_callable() bodies would throw at runtime).
-        if is_intr
-            effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE)
-        end
         info = is_intr ? CC.NoCallInfo() : result.info
         if is_intr || rt_override !== nothing
             return CC.CallMeta(rt, result.exct, effects, info)

From ca6e92342ab9c414d69021864469ab9d123b1ced Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 10:18:30 +0100
Subject: [PATCH 13/17] Use released versions of packages.

---
 Project.toml      | 4 ----
 test/Project.toml | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index 5323854..6f344ac 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,10 +17,6 @@ IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c"
 
-[sources]
-CompilerCaching = {rev = "main", url = "https://github.com/maleadt/CompilerCaching.jl"}
-IRStructurizer = {rev = "main", url = "https://github.com/maleadt/IRStructurizer.jl"}
-
 [extensions]
 CUDAExt = "CUDA"
 DLFP8TypesExt = "DLFP8Types"
diff --git a/test/Project.toml b/test/Project.toml
index 278b9d8..bd30c97 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,9 +8,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
-[sources]
-FileCheck = {url = "https://github.com/JuliaLLVM/FileCheck.jl", rev = "main"}
-
 [compat]
 FileCheck = "1.0"
 ParallelTestRunner = "2.0"

From 09bb7e3ba8064ed870e4d4c5b04c7489a18b80ea Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 11:14:35 +0100
Subject: [PATCH 14/17] Handle literal SSA statements in emit_statement!

When Julia's optimizer constant-folds an SSA statement (via concrete eval,
SROA, constant propagation), it becomes a bare literal value instead of an
Expr(:call, ...). The else-branch in emit_statement! previously just warned
and discarded these, so the SSA slot was never registered in ctx.values,
causing "SSAValue not found" crashes for any downstream reference.

Delegate to emit_constant!/emit_value! for literal values, mirroring the
existing pattern in emit_rhs!. Add regression test using crafted IR (the
optimizer propagates constants too aggressively for a source-level repro).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/compiler/codegen/statements.jl |  9 +++-
 test/codegen/integration.jl        | 68 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/src/compiler/codegen/statements.jl b/src/compiler/codegen/statements.jl
index 13b8c60..d073c08 100644
--- a/src/compiler/codegen/statements.jl
+++ b/src/compiler/codegen/statements.jl
@@ -26,9 +26,14 @@ function emit_statement!(ctx::CGCtx, @nospecialize(stmt), ssa_idx::Int, @nospeci
         # PiNode is a type narrowing assertion - store the resolved value
         tv = emit_value!(ctx, stmt)
     elseif stmt === nothing
-        # No-op
+        # Dead code elimination artifact — no value to register
     else
-        @warn "Unhandled statement type" typeof(stmt) stmt
+        # Literal values from constant folding or concrete eval.
+        # Try emit_constant! first (numbers/ghost types), fall back to emit_value!.
+        tv = emit_constant!(ctx, stmt, result_type)
+        if tv === nothing
+            tv = emit_value!(ctx, stmt)
+        end
     end
 
     # Store result by original Julia SSA index
diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index a1503dc..91e1c20 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -1119,3 +1119,71 @@ end
         end
     end
 end
+
+#=============================================================================
+ Literal SSA Statement Handling
+=============================================================================#
+
+@testset "Literal SSA statements" begin
+    # Regression test: when Julia's optimizer constant-folds an SSA statement
+    # (via concrete eval, SROA, constant propagation), the statement becomes a
+    # bare literal value instead of an Expr(:call, ...). emit_statement! must
+    # register a CGVal for these so downstream SSAValue references resolve.
+    #
+    # Strategy: compile a real kernel, replace one Expr with a literal in the
+    # IRCode (simulating constant folding), then verify codegen succeeds.
+
+    spec = ct.ArraySpec{1}(16, true)
+
+    function _literal_test_kernel(a::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, pid, (16,))
+        ct.store(a, pid, tile)
+        return
+    end
+
+    function _find_intrinsic_call(ir, callee)
+        for (i, inst) in enumerate(ir.stmts)
+            stmt = inst[:stmt]
+            if stmt isa Expr && stmt.head === :call && length(stmt.args) >= 1
+                if stmt.args[1] === callee
+                    return i
+                end
+            end
+        end
+        return nothing
+    end
+
+    function _test_literal_ssa(value)
+        argtypes = Tuple{ct.TileArray{Float32,1,spec}}
+        world = Base.get_world_counter()
+        mi = something(
+            ct.method_instance(_literal_test_kernel, argtypes; world,
+                               method_table=ct.cuTileMethodTable),
+            ct.method_instance(_literal_test_kernel, argtypes; world))
+        ir, _ = ct.code_ircode(mi)
+
+        # Replace first subi(pid, 1) with a literal — simulates constant folding
+        idx = _find_intrinsic_call(ir, ct.Intrinsics.subi)
+        @assert idx !== nothing "test setup: could not find subi call in IR"
+        ir.stmts[idx][:stmt] = value
+
+        sci = ct.StructuredIRCode(ir)
+        bytecode = ct.write_bytecode!(1) do writer, func_buf
+            ct.emit_kernel!(writer, func_buf, sci, Nothing;
+                name="literal_test",
+                cache=ct.CacheView{ct.CuTileResults}(
+                    (:cuTile, (sm_arch=nothing, opt_level=3,
+                               num_ctas=nothing, occupancy=nothing)), world))
+        end
+        return length(bytecode) > 0
+    end
+
+    @testset "Int32 zero literal" begin
+        @test _test_literal_ssa(Int32(0))
+    end
+
+    @testset "Int32 nonzero literal" begin
+        @test _test_literal_ssa(Int32(42))
+    end
+end

From 8aa32eefa5f6f64915d67675ef076904ee70e4e1 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 11:28:37 +0100
Subject: [PATCH 15/17] Emit false constant for :boundscheck expressions in
 codegen

Previously, emit_expr! returned nothing for Expr(:boundscheck), so the
SSA slot was never registered. When the IR structurizer created an IfOp
whose condition referenced that SSA, emit_value! crashed with "SSAValue
not found". This happens when concrete_eval_eligible doesn't block
semi-concrete eval, causing @boundscheck blocks from tuple indexing in
the One() adapter to survive to codegen.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/compiler/codegen/expressions.jl |  4 +-
 test/codegen/integration.jl         | 57 +++++++++++++++++++++++++----
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl
index 9abe616..61474c2 100644
--- a/src/compiler/codegen/expressions.jl
+++ b/src/compiler/codegen/expressions.jl
@@ -17,7 +17,9 @@ function emit_expr!(ctx::CGCtx, expr::Expr, @nospecialize(result_type))
     elseif expr.head === :foreigncall
         throw(IRError("Foreign calls not supported in Tile IR"))
     elseif expr.head === :boundscheck
-        return nothing
+        # Bounds checking is always disabled in Tile IR kernels.
+        # Emit false so IfOps referencing this SSA can resolve the condition.
+        return emit_constant!(ctx, false, Bool)
     else
         @warn "Unhandled expression head" expr.head expr
         return nothing
diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index 91e1c20..546e6e9 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -1124,14 +1124,10 @@ end
  Literal SSA Statement Handling
 =============================================================================#
 
-@testset "Literal SSA statements" begin
-    # Regression test: when Julia's optimizer constant-folds an SSA statement
-    # (via concrete eval, SROA, constant propagation), the statement becomes a
-    # bare literal value instead of an Expr(:call, ...). emit_statement! must
-    # register a CGVal for these so downstream SSAValue references resolve.
-    #
-    # Strategy: compile a real kernel, replace one Expr with a literal in the
-    # IRCode (simulating constant folding), then verify codegen succeeds.
+@testset "Statement emission edge cases" begin
+    # Regression tests: certain IR statement forms (literal values from constant
+    # folding, :boundscheck expressions from inlined @boundscheck blocks) must
+    # register CGVals so downstream SSAValue references resolve.
 
     spec = ct.ArraySpec{1}(16, true)
 
@@ -1186,4 +1182,49 @@ end
     @testset "Int32 nonzero literal" begin
         @test _test_literal_ssa(Int32(42))
     end
+
+    @testset "Expr(:boundscheck) registers CGVal" begin
+        # Regression test: Expr(:boundscheck) must emit a Bool constant so that
+        # downstream SSAValue references (e.g., IfOp conditions) can resolve.
+        # Previously, emit_expr! returned nothing for :boundscheck, leaving the
+        # SSA slot unregistered and causing "SSAValue not found" crashes.
+        #
+        # Strategy: inject Expr(:boundscheck) at the subi position and replace
+        # the downstream reference with a constant so codegen completes cleanly.
+        argtypes = Tuple{ct.TileArray{Float32,1,spec}}
+        world = Base.get_world_counter()
+        mi = something(
+            ct.method_instance(_literal_test_kernel, argtypes; world,
+                               method_table=ct.cuTileMethodTable),
+            ct.method_instance(_literal_test_kernel, argtypes; world))
+        ir, _ = ct.code_ircode(mi)
+
+        # Replace first subi with Expr(:boundscheck) — simulates inlined @boundscheck
+        idx = _find_intrinsic_call(ir, ct.Intrinsics.subi)
+        @assert idx !== nothing "test setup: could not find subi call in IR"
+        ir.stmts[idx][:stmt] = Expr(:boundscheck)
+        ir.stmts[idx][:type] = Bool
+        # Fix downstream: replace the SSAValue reference to subi with a constant
+        # so the load_view doesn't fail on a Bool argument
+        for i in (idx+1):length(ir.stmts)
+            stmt = ir.stmts[i][:stmt]
+            if stmt isa Expr
+                for (j, arg) in enumerate(stmt.args)
+                    if arg === Core.SSAValue(idx)
+                        stmt.args[j] = Int32(0)
+                    end
+                end
+            end
+        end
+
+        sci = ct.StructuredIRCode(ir)
+        bytecode = ct.write_bytecode!(1) do writer, func_buf
+            ct.emit_kernel!(writer, func_buf, sci, Nothing;
+                name="boundscheck_test",
+                cache=ct.CacheView{ct.CuTileResults}(
+                    (:cuTile, (sm_arch=nothing, opt_level=3,
+                               num_ctas=nothing, occupancy=nothing)), world))
+        end
+        @test length(bytecode) > 0
+    end
 end

From 60fa5bbfdb3ff6482b39025d5337b0e502b842d6 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 11:33:52 +0100
Subject: [PATCH 16/17] Re-enable concrete eval for intrinsics.

---
 src/compiler/interface.jl | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index 6935a75..86f8f8b 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -250,7 +250,6 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
 end
 
 # Disable semi-concrete interpretation (broken with overlays per JuliaLang/julia#47349)
-# and block concrete eval for intrinsics (not_callable() bodies return dummy values).
 function CC.concrete_eval_eligible(interp::cuTileInterpreter,
     @nospecialize(f), result::CC.MethodCallResult, arginfo::CC.ArgInfo, sv::CC.InferenceState)
     ret = @invoke CC.concrete_eval_eligible(interp::CC.AbstractInterpreter,
@@ -258,9 +257,6 @@ function CC.concrete_eval_eligible(interp::cuTileInterpreter,
     if ret === :semi_concrete_eval
         return :none
     end
-    if ret === :concrete_eval && isintrinsic(f)
-        return :none
-    end
     return ret
 end
 

From ccd6572771b3c8107a0b510212f7837f95093921 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 11 Feb 2026 11:36:43 +0100
Subject: [PATCH 17/17] Remove fragile tests.

---
 test/codegen/integration.jl | 109 ------------------------------------
 1 file changed, 109 deletions(-)

diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index 546e6e9..a1503dc 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -1119,112 +1119,3 @@ end
         end
     end
 end
-
-#=============================================================================
- Literal SSA Statement Handling
-=============================================================================#
-
-@testset "Statement emission edge cases" begin
-    # Regression tests: certain IR statement forms (literal values from constant
-    # folding, :boundscheck expressions from inlined @boundscheck blocks) must
-    # register CGVals so downstream SSAValue references resolve.
-
-    spec = ct.ArraySpec{1}(16, true)
-
-    function _literal_test_kernel(a::ct.TileArray{Float32,1})
-        pid = ct.bid(1)
-        tile = ct.load(a, pid, (16,))
-        ct.store(a, pid, tile)
-        return
-    end
-
-    function _find_intrinsic_call(ir, callee)
-        for (i, inst) in enumerate(ir.stmts)
-            stmt = inst[:stmt]
-            if stmt isa Expr && stmt.head === :call && length(stmt.args) >= 1
-                if stmt.args[1] === callee
-                    return i
-                end
-            end
-        end
-        return nothing
-    end
-
-    function _test_literal_ssa(value)
-        argtypes = Tuple{ct.TileArray{Float32,1,spec}}
-        world = Base.get_world_counter()
-        mi = something(
-            ct.method_instance(_literal_test_kernel, argtypes; world,
-                               method_table=ct.cuTileMethodTable),
-            ct.method_instance(_literal_test_kernel, argtypes; world))
-        ir, _ = ct.code_ircode(mi)
-
-        # Replace first subi(pid, 1) with a literal — simulates constant folding
-        idx = _find_intrinsic_call(ir, ct.Intrinsics.subi)
-        @assert idx !== nothing "test setup: could not find subi call in IR"
-        ir.stmts[idx][:stmt] = value
-
-        sci = ct.StructuredIRCode(ir)
-        bytecode = ct.write_bytecode!(1) do writer, func_buf
-            ct.emit_kernel!(writer, func_buf, sci, Nothing;
-                name="literal_test",
-                cache=ct.CacheView{ct.CuTileResults}(
-                    (:cuTile, (sm_arch=nothing, opt_level=3,
-                               num_ctas=nothing, occupancy=nothing)), world))
-        end
-        return length(bytecode) > 0
-    end
-
-    @testset "Int32 zero literal" begin
-        @test _test_literal_ssa(Int32(0))
-    end
-
-    @testset "Int32 nonzero literal" begin
-        @test _test_literal_ssa(Int32(42))
-    end
-
-    @testset "Expr(:boundscheck) registers CGVal" begin
-        # Regression test: Expr(:boundscheck) must emit a Bool constant so that
-        # downstream SSAValue references (e.g., IfOp conditions) can resolve.
-        # Previously, emit_expr! returned nothing for :boundscheck, leaving the
-        # SSA slot unregistered and causing "SSAValue not found" crashes.
-        #
-        # Strategy: inject Expr(:boundscheck) at the subi position and replace
-        # the downstream reference with a constant so codegen completes cleanly.
-        argtypes = Tuple{ct.TileArray{Float32,1,spec}}
-        world = Base.get_world_counter()
-        mi = something(
-            ct.method_instance(_literal_test_kernel, argtypes; world,
-                               method_table=ct.cuTileMethodTable),
-            ct.method_instance(_literal_test_kernel, argtypes; world))
-        ir, _ = ct.code_ircode(mi)
-
-        # Replace first subi with Expr(:boundscheck) — simulates inlined @boundscheck
-        idx = _find_intrinsic_call(ir, ct.Intrinsics.subi)
-        @assert idx !== nothing "test setup: could not find subi call in IR"
-        ir.stmts[idx][:stmt] = Expr(:boundscheck)
-        ir.stmts[idx][:type] = Bool
-        # Fix downstream: replace the SSAValue reference to subi with a constant
-        # so the load_view doesn't fail on a Bool argument
-        for i in (idx+1):length(ir.stmts)
-            stmt = ir.stmts[i][:stmt]
-            if stmt isa Expr
-                for (j, arg) in enumerate(stmt.args)
-                    if arg === Core.SSAValue(idx)
-                        stmt.args[j] = Int32(0)
-                    end
-                end
-            end
-        end
-
-        sci = ct.StructuredIRCode(ir)
-        bytecode = ct.write_bytecode!(1) do writer, func_buf
-            ct.emit_kernel!(writer, func_buf, sci, Nothing;
-                name="boundscheck_test",
-                cache=ct.CacheView{ct.CuTileResults}(
-                    (:cuTile, (sm_arch=nothing, opt_level=3,
-                               num_ctas=nothing, occupancy=nothing)), world))
-        end
-        @test length(bytecode) > 0
-    end
-end