diff --git a/Project.toml b/Project.toml index dd1c4ea..6f344ac 100644 --- a/Project.toml +++ b/Project.toml @@ -8,27 +8,23 @@ projects = ["test", "examples"] [deps] BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8" CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" -[sources] -CompilerCaching = {url = "https://github.com/maleadt/CompilerCaching.jl", rev="main"} -IRStructurizer = {url = "https://github.com/maleadt/IRStructurizer.jl", rev = "main"} - [extensions] CUDAExt = "CUDA" DLFP8TypesExt = "DLFP8Types" [compat] -julia = "1.11" BFloat16s = "0.6" -CompilerCaching = "0.1" CUDA_Compiler_jll = "0.4" CUDA_Tile_jll = "13.1" +CompilerCaching = "0.1" IRStructurizer = "0.1" +julia = "1.11" diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl index 02b7c38..61474c2 100644 --- a/src/compiler/codegen/expressions.jl +++ b/src/compiler/codegen/expressions.jl @@ -17,7 +17,9 @@ function emit_expr!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) elseif expr.head === :foreigncall throw(IRError("Foreign calls not supported in Tile IR")) elseif expr.head === :boundscheck - return nothing + # Bounds checking is always disabled in Tile IR kernels. + # Emit false so IfOps referencing this SSA can resolve the condition. + return emit_constant!(ctx, false, Bool) else @warn "Unhandled expression head" expr.head expr return nothing @@ -79,9 +81,7 @@ function emit_call!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) func = get_constant(ctx, args[1]) call_args = args[2:end] - # TODO: This is normally dynamic dispatch, which we should allow. - # However, we currently trigger this when emitting Julia intrinsics. - # We should switch to our own intrinsics entirely, which are only invoked. + # We enter here for dynamic dispatch, but also for all intrinsic functions. @static if isdefined(Core, :throw_methoderror) if func === Core.throw_methoderror diff --git a/src/compiler/codegen/statements.jl b/src/compiler/codegen/statements.jl index 13b8c60..d073c08 100644 --- a/src/compiler/codegen/statements.jl +++ b/src/compiler/codegen/statements.jl @@ -26,9 +26,14 @@ function emit_statement!(ctx::CGCtx, @nospecialize(stmt), ssa_idx::Int, @nospeci # PiNode is a type narrowing assertion - store the resolved value tv = emit_value!(ctx, stmt) elseif stmt === nothing - # No-op + # Dead code elimination artifact β€” no value to register else - @warn "Unhandled statement type" typeof(stmt) stmt + # Literal values from constant folding or concrete eval. + # Try emit_constant! first (numbers/ghost types), fall back to emit_value!. 
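+        # (typically seen when inference replaces a call with its constant result,
+        # leaving the literal value itself as the statement)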
+ tv = emit_constant!(ctx, stmt, result_type) + if tv === nothing + tv = emit_value!(ctx, stmt) + end end # Store result by original Julia SSA index diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index bdc6e8a..86f8f8b 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -74,14 +74,20 @@ CC.may_compress(::cuTileInterpreter) = true CC.may_discard_trees(::cuTileInterpreter) = false #============================================================================= - Custom return-type inference (tfuncs) for intrinsics + Custom inference for intrinsics =============================================================================# -# Per-intrinsic return type overrides using multiple dispatch. +# Per-intrinsic return type overrides. # Returns nothing when no override applies (fallback). -# Concrete per-intrinsic methods are defined in intrinsics/ (after the -# Intrinsics module exists). -tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing +tfunc(𝕃, @nospecialize(f), @nospecialize args...) = nothing + +# Per-intrinsic effect overrides. +# Returns nothing when no override applies (fallback). +efunc(@nospecialize(f), effects::CC.Effects) = nothing + +# Predicate for functions defined in the Intrinsics module. +# These get NoCallInfo() so they stay as Expr(:call) rather than Expr(:invoke). +isintrinsic(@nospecialize(f)) = isa(f, Function) && parentmodule(f) === Intrinsics #============================================================================= Subprogram inference for reduce/scan @@ -172,9 +178,11 @@ end result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing}, sv::CC.InferenceState, max_methods::Int) - rt_override = tfunc(f, arginfo.argtypes) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interpβ€², svβ€²) isready(result) || return false @@ -182,8 +190,11 @@ end cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -195,9 +206,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) - rt_override = tfunc(f, arginfo.argtypes) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) 
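+    # arginfo.argtypes[1] is the lattice element for `f` itself, so only the call
+    # arguments are forwarded; passing `𝕃` lets overrides return refined lattice
+    # elements, e.g. tfunc(𝕃, ::typeof(Intrinsics.andi), x, y) may return Const(false).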
subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interpβ€², svβ€²) isready(result) || return false @@ -205,8 +218,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -219,10 +235,15 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) _infer_subprogram(interp, f, arginfo, si, nothing, sv) # side-effect only - rt_override = tfunc(f, arginfo.argtypes) - if rt_override !== nothing - return CC.CallMeta(rt_override, result.exct, result.effects, - result.info) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) + rt = rt_override !== nothing ? rt_override : result.rt + efunc_override = is_intr ? efunc(f, result.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : result.effects + info = is_intr ? CC.NoCallInfo() : result.info + if is_intr || rt_override !== nothing + return CC.CallMeta(rt, result.exct, effects, info) end return result end diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index aa0d425..5275036 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -4,7 +4,7 @@ module Intrinsics -using Base: compilerbarrier, donotdelete +using Base: compilerbarrier, inferencebarrier using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual @@ -12,24 +12,37 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end -# NOTE: Due to JuliaLang/julia#60583, intrinsics may be called during constant evaluation. -# Because of that, such intrinsics (such as basic arithmetic) need to provide an -# implementation that actually computes a valid result using Julia intrinsics. -# -# Sometimes that's not possible, e.g., because the functionality required for that is -# overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those -# intrinsics we disable constant folding using a `compilerbarrier(:const)` -# -# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their -# bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin -# with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body. -# `@assume_effects !:effect_free` does NOT work β€” `override_effects` can only strengthen -# effects (set ALWAYS_TRUE), not weaken them. 
Spoofing `ipo_effects` via a custom -# `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting -# `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct -# approach. +""" + @intrinsic signature -emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing +Define a Tile IR intrinsic in the `Intrinsics` module. These intrinsics are +defined to return `Any`, so need additional `tfunc` and `efunc` definitions +to specify their behavior. +""" +macro intrinsic(ex) + body = quote + compilerbarrier(:type, nothing) + end + funcdef = Expr(:function, ex, body) + funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef) + return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) +end + +""" + instanceof_tfunc(lat) -> Type or nothing + +Extract `T` from a lattice element representing `Type{T}`. +Simplified version of `Base.Compiler.instanceof_tfunc` that handles `Const(T)` +and `Type{T}` lattice elements. Returns `nothing` when `T` cannot be determined. +""" +function instanceof_tfunc(@nospecialize(lat)) + if isa(lat, CC.Const) + val = lat.val + return val isa Type ? val : nothing + end + tgt = CC.widenconst(lat) + return tgt isa DataType && tgt <: Type && !isempty(tgt.parameters) ? tgt.parameters[1] : nothing +end # Shared helper for creating load/store optimization hints function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true) @@ -39,6 +52,8 @@ function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, all return make_load_store_hints(ctx.sm_arch, hints) end +emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing + include("intrinsics/core.jl") include("intrinsics/conversions.jl") include("intrinsics/arithmetic.jl") diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 6272251..3aba1c6 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -84,53 +84,40 @@ end ## Integer arithmetic # cuda_tile.absi -@eval Intrinsics begin - """Integer absolute value. Compiled to cuda_tile.absi.""" - @noinline absi(x::T) where {T<:Integer} = - ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x) - @noinline absi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) -end +@intrinsic absi(x::Integer) +@intrinsic absi(x::Tile{<:Integer}) +tfunc(𝕃, ::typeof(Intrinsics.absi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args) emit_unop!(ctx, args, encode_AbsIOp!) end # cuda_tile.addi -@eval Intrinsics begin - @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) - @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic addi(x::T, y::T) where {T<:Integer} +@intrinsic addi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.addi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) emit_binop!(ctx, args, encode_AddIOp!) 
end # cuda_tile.cldi (ceiling division, toward positive infinity) -@eval Intrinsics begin - @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic cldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic cldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf) end # cuda_tile.cmpi -@eval Intrinsics begin - @noinline function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} - if pred === CmpLessThan - s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - elseif pred === CmpLessThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y) - elseif pred === CmpGreaterThan - s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x) - elseif pred === CmpGreaterThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_int(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_int(x, y) - end +@intrinsic cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} +@intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s)) + t = CC.widenconst(x) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} end - @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} = - Tile{Bool, S}() + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) cb = ctx.cb @@ -156,118 +143,95 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) end # cuda_tile.divi (truncating division, toward zero) -@eval Intrinsics begin - @noinline function divi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? 
Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) - end -end +@intrinsic divi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingZero) end # cuda_tile.fldi (floor division, toward negative infinity) -@eval Intrinsics begin - @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic fldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic fldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf) end # cuda_tile.maxi -@eval Intrinsics begin - @noinline function maxi(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, y, x) - end - @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() -end +@intrinsic maxi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) emit_binop!(ctx, args, encode_MaxIOp!; signedness) end # cuda_tile.mini -@eval Intrinsics begin - @noinline function mini(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, x, y) - end - @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() -end +@intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic mini(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) emit_binop!(ctx, args, encode_MinIOp!; signedness) end # cuda_tile.muli -@eval Intrinsics begin - @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) - @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic muli(x::T, y::T) where {T<:Integer} +@intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) end # cuda_tile.mulhii -@eval Intrinsics begin - """High bits of integer multiply (for extended precision arithmetic). 
Compiled to cuda_tile.mulhii.""" - @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} - ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T - end - @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic mulhii(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) end # cuda_tile.negi -@eval Intrinsics begin - @noinline negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x) - @noinline negi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) -end +@intrinsic negi(x::T) where {T<:Integer} +@intrinsic negi(a::Tile{<:Integer}) +tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args) emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone) end # cuda_tile.remi -@eval Intrinsics begin - @noinline function remi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y) - end -end +@intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic remi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness")) emit_binop!(ctx, args, encode_RemIOp!; signedness) end # cuda_tile.shli -@eval Intrinsics begin - @noinline shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) -end +@intrinsic shli(x::T, y::Integer) where {T<:Integer} +@intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) emit_binop!(ctx, args, encode_ShLIOp!) end # cuda_tile.shri -@eval Intrinsics begin - @noinline function shri(x::T, y::Integer, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T) - end -end +@intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer} +@intrinsic shri(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness")) emit_binop!(ctx, args, encode_ShRIOp!; signedness) end # cuda_tile.subi -@eval Intrinsics begin - @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) - @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic subi(x::T, y::T) where {T<:Integer} +@intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) 
end @@ -276,42 +240,31 @@ end ## Floating-point arithmetic # cuda_tile.absf -@eval Intrinsics begin - @noinline absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x) - @noinline absf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) -end +@intrinsic absf(x::T) where {T<:AbstractFloat} +@intrinsic absf(a::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args) emit_unop!(ctx, args, encode_AbsFOp!) end # cuda_tile.addf -@eval Intrinsics begin - @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) - @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic addf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) end # cuda_tile.cmpf -@eval Intrinsics begin - @noinline function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} - if pred === CmpLessThan - Core.Intrinsics.lt_float(x, y) - elseif pred === CmpLessThanOrEqual - Core.Intrinsics.le_float(x, y) - elseif pred === CmpGreaterThan - Core.Intrinsics.lt_float(y, x) - elseif pred === CmpGreaterThanOrEqual - Core.Intrinsics.le_float(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_float(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_float(x, y) - end +@intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} +@intrinsic cmpf(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate) where {T<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred)) + t = CC.widenconst(x) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} end - @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} = - Tile{Bool, S}() + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) cb = ctx.cb @@ -336,37 +289,33 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) end # cuda_tile.divf -@eval Intrinsics begin - @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) - @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic divf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) end # cuda_tile.mulf -@eval Intrinsics begin - @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) - @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic mulf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) 
end # cuda_tile.negf -@eval Intrinsics begin - @noinline negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x) - @noinline negf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) -end +@intrinsic negf(x::T) where {T<:AbstractFloat} +@intrinsic negf(a::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args) emit_unop!(ctx, args, encode_NegFOp!) end # cuda_tile.subf -@eval Intrinsics begin - @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) - @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic subf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) end @@ -375,10 +324,15 @@ end ## Boolean arithmetic # cuda_tile.andi -@eval Intrinsics begin - @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) - """Element-wise logical AND for boolean tiles.""" - @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic andi(x::T, y::T) where {T<:Integer} +@intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === false && CC.widenconst(y) === Bool + return CC.Const(false) + elseif isa(y, CC.Const) && y.val === false && CC.widenconst(x) === Bool + return CC.Const(false) + end + return CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb @@ -396,10 +350,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) end # cuda_tile.ori -@eval Intrinsics begin - @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) - """Element-wise logical OR for boolean tiles.""" - @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic ori(x::T, y::T) where {T<:Integer} +@intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === true && CC.widenconst(y) === Bool + return CC.Const(true) + elseif isa(y, CC.Const) && y.val === true && CC.widenconst(x) === Bool + return CC.Const(true) + end + return CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb @@ -417,11 +376,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) end # cuda_tile.xori -@eval Intrinsics begin - @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) - """Element-wise logical XOR for boolean tiles.""" - @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() -end +@intrinsic xori(x::T, y::T) where {T<:Integer} +@intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 3c89bd4..9c480bf 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -31,20 +31,11 @@ 
function memory_scope_to_scope(scope::Int) end # cuda_tile.atomic_cas_tko -@eval Intrinsics begin - """ - atomic_cas(array, index, expected, desired, memory_order, memory_scope) - - Atomic compare-and-swap at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_cas_tko. - """ - @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T))::T - end -end +@intrinsic atomic_cas(array, index, expected, desired, + memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) cb = ctx.cb tt = ctx.tt @@ -169,39 +160,20 @@ function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) end # cuda_tile.atomic_rmw_tko with XCHG -@eval Intrinsics begin - """ - atomic_xchg(array, index, val, memory_order, memory_scope) - - Atomic exchange at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with XCHG. - """ - @noinline function atomic_xchg(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_xchg(array, index, val, memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) emit_atomic_rmw!(ctx, args, AtomicXCHG) end # cuda_tile.atomic_rmw_tko with ADD -@eval Intrinsics begin - """ - atomic_add(array, index, val, memory_order, memory_scope) - - Atomic addition at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with ADD. - """ - @noinline function atomic_add(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_add(array, index, val, + memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 6c33afc..e302063 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -3,10 +3,12 @@ # TODO: cuda_tile.bitcast # cuda_tile.exti (scalar integer extension) -@eval Intrinsics begin - @noinline function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} - s === SignednessSigned ? 
Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) - end +@intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) cb = ctx.cb @@ -26,10 +28,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) end # cuda_tile.ftof (scalar float to float) -@eval Intrinsics begin - @noinline function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} - sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x) - end +@intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) cb = ctx.cb @@ -48,10 +52,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) end # cuda_tile.ftoi (scalar float to integer) -@eval Intrinsics begin - @noinline function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} - s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x) - end +@intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) cb = ctx.cb @@ -71,10 +77,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) end # cuda_tile.itof (scalar integer to float) -@eval Intrinsics begin - @noinline function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} - s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x) - end +@intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) cb = ctx.cb @@ -94,8 +102,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) end # cuda_tile.trunci (scalar integer truncation) -@eval Intrinsics begin - @noinline trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x) +@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? 
similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args) cb = ctx.cb diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index b64fbcf..306d13a 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -19,22 +19,11 @@ function validate_tile_shape(shape, context::String) end # cuda_tile.broadcast -@eval Intrinsics begin - """ - broadcast(tile, shape_val) - - Explicitly broadcast a tile to a target shape. - Compiled to cuda_tile.broadcast. - """ - @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic broadcast(tile, shape) +function tfunc(𝕃, ::typeof(Intrinsics.broadcast), @nospecialize(tile), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile) tile_type <: Tile || return nothing - shape_arg = argtypes[3] + shape_arg = shape_arg isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -109,22 +98,10 @@ function broadcast_tile_to_shape!(cb::CodeBuilder, tt::TypeTable, tv::CGVal, end # cuda_tile.cat -@eval Intrinsics begin - """ - cat(tiles, axis_val) - - Concatenate two tiles along 0-indexed axis. - Compiled to cuda_tile.cat. - """ - @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic cat(tiles, axis) +function tfunc(𝕃, ::typeof(Intrinsics.cat), @nospecialize(tiles), @nospecialize(axis_arg)) + tuple_type = CC.widenconst(tiles) tuple_type <: Tuple{Tile, Tile} || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val t1_type = tuple_type.parameters[1] @@ -186,25 +163,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args) end # cuda_tile.constant -@eval Intrinsics begin - """ - constant(shape, value, T) - - Create a tile filled with a constant value. - Compiled to cuda_tile.constant. - """ - @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - shape_arg = argtypes[2] +@intrinsic constant(shape, value, T) +function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[4]) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args) @@ -236,22 +200,10 @@ end # TODO: cuda_tile.entry # cuda_tile.extract -@eval Intrinsics begin - """ - extract(tile, index_val, shape_val) - - Extract a sub-tile from tile at 0-indexed slice indices. - Compiled to cuda_tile.extract. 
- """ - @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic extract(tile, index, shape) +function tfunc(𝕃, ::typeof(Intrinsics.extract), @nospecialize(tile_lat), @nospecialize(index), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[4] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -300,15 +252,8 @@ end # TODO: cuda_tile.get_global # cuda_tile.get_num_tile_blocks -@eval Intrinsics begin - """ - get_num_tile_blocks(axis)::Int32 - - Get the grid size along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_num_tile_blocks. - """ - @noinline get_num_tile_blocks(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_num_tile_blocks(axis) +tfunc(𝕃, ::typeof(Intrinsics.get_num_tile_blocks), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis")) @@ -320,15 +265,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), a end # cuda_tile.get_tile_block_id -@eval Intrinsics begin - """ - get_tile_block_id(axis)::Int32 - - Get the block ID along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_tile_block_id. - """ - @noinline get_tile_block_id(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_tile_block_id(axis) +tfunc(𝕃, ::typeof(Intrinsics.get_tile_block_id), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis")) @@ -343,25 +281,12 @@ end # TODO: cuda_tile.global # cuda_tile.iota -@eval Intrinsics begin - """ - iota(shape, T) - - Create a 1D tile with values [0, 1, 2, ..., shape[1]-1] (0-indexed). - Compiled to cuda_tile.iota. - """ - @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - shape_arg = argtypes[2] +@intrinsic iota(shape, T) +function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[3]) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) @@ -387,17 +312,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) end # cuda_tile.mmaf, cuda_tile.mmai -@eval Intrinsics begin - """ - mma(a, b, acc) - - Matrix-multiply-accumulate: result = a @ b + acc. - Compiled to cuda_tile.mmaf or cuda_tile.mmai. 
- """ - @noinline function mma(a::Tile{T1}, b::Tile{T2}, acc::Tile{T3, SC}) where {T1, T2, T3, SC} - Tile{T3, SC}() - end -end +@intrinsic mma(a::Tile, b::Tile, acc::Tile) +tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args) cb = ctx.cb @@ -415,16 +331,15 @@ end # TODO: cuda_tile.module # cuda_tile.offset -@eval Intrinsics begin - """ - offset(base, offsets) - - Compute base_ptr + offsets for each element of offsets tile (element-scaled). - Returns a tile of pointers. Compiled to cuda_tile.offset. - """ - @noinline function offset(base::Ptr{T}, offsets::Tile{I, S}) where {T, I <: Integer, S} - Tile{Ptr{T}, S}() - end +@intrinsic offset(base, offsets) +function tfunc(𝕃, ::typeof(Intrinsics.offset), @nospecialize(base), @nospecialize(offsets)) + base_type = CC.widenconst(base) + base_type <: Ptr || return nothing + offsets_type = CC.widenconst(offsets) + offsets_type <: Tile || return nothing + T = eltype(base_type) + S = offsets_type.parameters[2] + return Tile{Ptr{T}, S} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.offset), args) cb = ctx.cb @@ -469,22 +384,10 @@ end # TODO: cudatile.pack # cuda_tile.permute -@eval Intrinsics begin - """ - permute(tile, perm_val) - - Permute tile dimensions according to 0-indexed permutation. - Compiled to cuda_tile.permute. - """ - @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic permute(tile, perm) +function tfunc(𝕃, ::typeof(Intrinsics.permute), @nospecialize(tile_lat), @nospecialize(perm_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - perm_arg = argtypes[3] isa(perm_arg, CC.Const) || return nothing perm = perm_arg.val s = size(tile_type) @@ -529,20 +432,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args) end # cuda_tile.transpose -@eval Intrinsics begin - """ - transpose(tile) - - Transpose a 2D tile, swapping its dimensions. - Compiled to cuda_tile.permute with perm=(1, 0). - """ - @noinline function transpose(tile::Tile{T}) where {T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic transpose(tile) +function tfunc(𝕃, ::typeof(Intrinsics.transpose), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing s = size(tile_type) isempty(s) && return nothing @@ -576,29 +468,10 @@ end # cuda_tile.reduce -@eval Intrinsics begin - """ - reduce(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple) -> Tuple{Tile...} - - Reduce tiles along a 0-indexed axis using combiner `f` with per-operand - identity values. Accepts and returns tuples of tiles; single-operand - callers wrap in 1-tuples and unwrap with `[1]`. - Compiled to cuda_tile.reduce. 
- """ - @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}) where {T, S} - compilerbarrier(:type, nothing) - end - @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f, - identities::Tuple{Any, Any}) where {T1, T2, S} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic reduce(tiles, axis, f, identities) +function tfunc(𝕃, ::typeof(Intrinsics.reduce), @nospecialize(tiles), @nospecialize(axis_arg), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val result_params = Any[] @@ -724,22 +597,10 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer = IntegerIdentityVal(to_uint128(T(val)), dtype, T) # cuda_tile.reshape -@eval Intrinsics begin - """ - reshape(tile, shape_val) - - Reshape a tile to a new shape (same total elements). - Compiled to cuda_tile.reshape. - """ - @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic reshape(tile, shape) +function tfunc(𝕃, ::typeof(Intrinsics.reshape), @nospecialize(tile_lat), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[3] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -803,24 +664,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) end # cuda_tile.scan -@eval Intrinsics begin - """ - scan(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple, reverse=false) -> Tuple{Tile...} - - Parallel prefix scan along a 0-indexed axis using combiner `f` with - per-operand identity values. Accepts and returns tuples of tiles; - single-operand callers wrap in 1-tuples and unwrap with `[1]`. - `reverse=true` for a reverse (suffix) scan. - Compiled to cuda_tile.scan. - """ - @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}, reverse::Bool=false) where {T, S} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic scan(tiles, axis, f, identities, reverse=false) +function tfunc(𝕃, ::typeof(Intrinsics.scan), @nospecialize(tiles), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing result_params = Any[] for p in tuple_type.parameters @@ -916,17 +762,19 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) end # cuda_tile.select -@eval Intrinsics begin - """ - select(cond, x, y) - - Element-wise conditional selection. - Compiled to cuda_tile.select. 
- """ - @noinline select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) - @noinline function select(cond::Tile{Bool, S}, x::Tile{T, S}, y::Tile{T, S}) where {T, S} - Tile{T, S}() +@intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y) +@intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T} +function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) + if isa(cond, CC.Const) + if cond.val === true + return x + elseif cond.val === false + return y + else + return Union{} + end end + return CC.tmerge(𝕃, x, y) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) cb = ctx.cb @@ -947,21 +795,16 @@ end # These are codegen-only reinterpret intrinsics for map(f, tile). # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped. # from_scalar: restores jltype to Tile{T, S}. -@eval Intrinsics begin - @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:type, nothing) - @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}() -end -function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - T = CC.widenconst(argtypes[2]) - shape_type = CC.widenconst(argtypes[3]) - shape_type <: Type || return nothing - S = shape_type.parameters[1] +@intrinsic to_scalar(tile) +@intrinsic from_scalar(x, S) +function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat)) + T = CC.widenconst(x) + S = instanceof_tfunc(S_lat) + S === nothing && return nothing return Tile{T, S} end -function tfunc(::typeof(Intrinsics.to_scalar), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing return eltype(tile_type) end diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index ded13df..519cf1e 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -3,41 +3,33 @@ ## Floating-point math # cuda_tile.ceil -@eval Intrinsics begin - """Ceiling (round toward positive infinity). Compiled to cuda_tile.ceil.""" - @noinline ceil(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline ceil(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic ceil(x::AbstractFloat) +@intrinsic ceil(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args) emit_unop!(ctx, args, encode_CeilOp!) end # cuda_tile.cos -@eval Intrinsics begin - """Cosine. Compiled to cuda_tile.cos.""" - @noinline cos(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cos(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cos(x::AbstractFloat) +@intrinsic cos(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args) emit_unop!(ctx, args, encode_CosOp!) end # cuda_tile.cosh -@eval Intrinsics begin - """Hyperbolic cosine. 
Compiled to cuda_tile.cosh.""" - @noinline cosh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cosh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cosh(x::AbstractFloat) +@intrinsic cosh(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args) emit_unop!(ctx, args, encode_CosHOp!) end # cuda_tile.exp2 -@eval Intrinsics begin - """Base-2 exponential (2^x). Compiled to cuda_tile.exp2.""" - @noinline exp2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp2(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp2(x::AbstractFloat, flush_to_zero::Bool=false) +@intrinsic exp2(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false) +tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) cb = ctx.cb @@ -52,11 +44,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) end # cuda_tile.exp -@eval Intrinsics begin - """Natural exponential (e^x). Compiled to cuda_tile.exp.""" - @noinline exp(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp(x::AbstractFloat) +@intrinsic exp(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) cb = ctx.cb @@ -69,21 +59,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) end # cuda_tile.floor -@eval Intrinsics begin - """Floor (round toward negative infinity). Compiled to cuda_tile.floor.""" - @noinline floor(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline floor(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic floor(x::AbstractFloat) +@intrinsic floor(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args) emit_unop!(ctx, args, encode_FloorOp!) end # cuda_tile.fma -@eval Intrinsics begin - """Fused multiply-add: a * b + c. Compiled to cuda_tile.fma.""" - @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic fma(x::T, y::T, z::T) where {T<:AbstractFloat} +@intrinsic fma(x::Tile{T}, y::Tile{T}, z::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -99,11 +85,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) end # cuda_tile.log2 -@eval Intrinsics begin - """Base-2 logarithm. 
Compiled to cuda_tile.log2.""" - @noinline log2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log2(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log2(x::AbstractFloat) +@intrinsic log2(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) cb = ctx.cb @@ -116,11 +100,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) end # cuda_tile.log -@eval Intrinsics begin - """Element-wise natural logarithm. Compiled to cuda_tile.log.""" - @noinline log(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log(x::AbstractFloat) +@intrinsic log(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) cb = ctx.cb @@ -133,49 +115,41 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) end # cuda_tile.maxf -@eval Intrinsics begin - @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y) - @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic maxf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic maxf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) end # cuda_tile.minf -@eval Intrinsics begin - @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y) - @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic minf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic minf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) end # cuda_tile.pow -@eval Intrinsics begin - """Element-wise power. Compiled to cuda_tile.pow.""" - @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic pow(x::T, y::T) where {T<:AbstractFloat} +@intrinsic pow(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) end # cuda_tile.remf -@eval Intrinsics begin - """Element-wise floating-point remainder. Compiled to cuda_tile.remf.""" - @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic remf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic remf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) end # cuda_tile.rsqrt -@eval Intrinsics begin - """Element-wise reciprocal square root. 
-    @noinline rsqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline rsqrt(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic rsqrt(x::AbstractFloat, flush_to_zero::Bool=false)
+@intrinsic rsqrt(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false)
+tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
     cb = ctx.cb
@@ -190,31 +164,25 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
 end
 # cuda_tile.sin
-@eval Intrinsics begin
-    """Element-wise sine. Compiled to cuda_tile.sin."""
-    @noinline sin(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sin(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sin(x::AbstractFloat)
+@intrinsic sin(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args)
     emit_unop!(ctx, args, encode_SinOp!)
 end
 # cuda_tile.sinh
-@eval Intrinsics begin
-    """Element-wise hyperbolic sine. Compiled to cuda_tile.sinh."""
-    @noinline sinh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sinh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sinh(x::AbstractFloat)
+@intrinsic sinh(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args)
     emit_unop!(ctx, args, encode_SinHOp!)
 end
 # cuda_tile.sqrt
-@eval Intrinsics begin
-    """Element-wise square root. Compiled to cuda_tile.sqrt."""
-    @noinline sqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sqrt(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sqrt(x::AbstractFloat)
+@intrinsic sqrt(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
     cb = ctx.cb
@@ -227,21 +195,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
 end
 # cuda_tile.tan
-@eval Intrinsics begin
-    """Element-wise tangent. Compiled to cuda_tile.tan."""
-    @noinline tan(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tan(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tan(x::AbstractFloat)
+@intrinsic tan(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args)
     emit_unop!(ctx, args, encode_TanOp!)
 end
 # cuda_tile.tanh
-@eval Intrinsics begin
-    """Element-wise hyperbolic tangent. Compiled to cuda_tile.tanh."""
-    @noinline tanh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tanh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tanh(x::AbstractFloat)
+@intrinsic tanh(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args)
     emit_unop!(ctx, args, encode_TanHOp!)
 end
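Aside: each floats.jl hunk above applies the same three-part recipe (intrinsic declaration, return-type override, emission handler). The sketch below shows how one more unary op would slot in; it is hypothetical, since a cuda_tile.cos op and an encode_CosOp! encoder are assumed here and are not part of this patch.

# Hypothetical, not part of this patch: another unary float op following the
# same pattern, assuming cuda_tile.cos and an encode_CosOp! encoder exist.
@intrinsic cos(x::AbstractFloat)
@intrinsic cos(x::Tile{<:AbstractFloat})
tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x)
function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args)
    emit_unop!(ctx, args, encode_CosOp!)
end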
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 1d42ad5..d4d4f87 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -3,23 +3,15 @@
 # TODO: cuda_tile.join_tokens
 # cuda_tile.load_ptr_tko
-@eval Intrinsics begin
-    """
-    load_ptr_tko(ptrs, latency, mask=nothing, padding=nothing)
-
-    Load values from a tile of pointers.
-    If mask is provided, masked-out positions return the padding value.
-    Compiled to cuda_tile.load_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based loads as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S},
-                                    latency::Union{Int, Nothing}=nothing,
-                                    mask::Union{Tile{Bool, S}, Nothing}=nothing,
-                                    padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
-        Tile{T, S}()
-    end
+@intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)
+function tfunc(𝕃, ::typeof(Intrinsics.load_ptr_tko), @nospecialize(ptrs), @nospecialize args...)
+    ptrs_type = CC.widenconst(ptrs)
+    ptrs_type <: Tile || return nothing
+    ptr_type = eltype(ptrs_type)
+    ptr_type <: Ptr || return nothing
+    T = eltype(ptr_type)
+    S = ptrs_type.parameters[2]
+    return Tile{T, S}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
     cb = ctx.cb
@@ -81,24 +73,12 @@ end
 # TODO: cuda_tile.make_token
 # cuda_tile.store_ptr_tko
-@eval Intrinsics begin
-    """
-    store_ptr_tko(ptrs, values, latency, mask=nothing)
-
-    Store values to a tile of pointers.
-    If mask is provided, masked-out positions are not written.
-    Compiled to cuda_tile.store_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based stores as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
-                                     latency::Union{Int, Nothing},
-                                     mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        donotdelete()
-        nothing
-    end
-end
+@intrinsic store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
+                         latency::Union{Int, Nothing},
+                         mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
+tfunc(𝕃, ::typeof(Intrinsics.store_ptr_tko), @nospecialize args...) = Nothing
+efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
     cb = ctx.cb
     tt = ctx.tt
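Aside: the efunc overrides above exist because the store intrinsics return Nothing; if their calls were also inferred as effect-free, Julia-level optimization could delete a store whose result is unused before codegen ever sees it. A minimal hypothetical sketch, assuming the intrinsic is called directly from kernel code:

# Hypothetical, not part of this patch: the store result is deliberately unused,
# so the effect_free=ALWAYS_FALSE override is what keeps the call alive.
function scatter_kernel(ptrs::Tile{Ptr{T}, S}, vals::Tile{T, S}) where {T, S}
    Intrinsics.store_ptr_tko(ptrs, vals, nothing)  # latency=nothing, default mask
    return nothing
end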
diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl
index 0b9f332..fa1c4ba 100644
--- a/src/compiler/intrinsics/misc.jl
+++ b/src/compiler/intrinsics/misc.jl
@@ -1,12 +1,10 @@
 # miscellaneous intrinsics
 # cuda_tile.assert
-@eval Intrinsics begin
-    @noinline function assert(cond::Bool, message::String)
-        donotdelete(cond, message)
-        nothing
-    end
-end
+@intrinsic assert(cond::Bool, message::String)
+tfunc(𝕃, ::typeof(Intrinsics.assert), @nospecialize(cond), @nospecialize(message)) = Nothing
+efunc(::typeof(Intrinsics.assert), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args)
     cond = @something emit_value!(ctx, args[1]) throw(IRError("assert: cannot resolve condition"))
     message = @something get_constant(ctx, args[2]) throw(IRError("assert: requires constant message"))
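Aside: the views.jl hunks that follow convert the view intrinsics to the same declaration/tfunc/efunc pattern. As a reading aid, here is a hypothetical sketch of how they chain together, assuming direct calls with the argument orders shown in the @intrinsic declarations below, and assuming padding_mode 0 and order=nothing select the defaults:

# Hypothetical, not part of this patch: copy tile (0, 0) from one array to another.
function copy_tile_kernel(src::TileArray{Float32, 2}, dst::TileArray{Float32, 2})
    src_pv = Intrinsics.make_partition_view(Intrinsics.make_tensor_view(src), (32, 32), 0, nothing)
    dst_pv = Intrinsics.make_partition_view(Intrinsics.make_tensor_view(dst), (32, 32), 0, nothing)
    t = Intrinsics.load_partition_view(src_pv, nothing, true, (0, 0))   # latency=nothing, allow_tma=true
    Intrinsics.store_partition_view(dst_pv, t, nothing, true, (0, 0))
    return nothing
end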
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index c8f1a88..fff19b1 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -24,17 +24,8 @@ function get_padding_value(ctx::CGCtx, args)
 end
 # cuda_tile.get_index_space_shape
-@eval Intrinsics begin
-    """
-    get_index_space_shape(pv::PartitionView, axis) -> Int32
-
-    Get the number of tiles along the given axis (0-indexed).
-    Compiled to cuda_tile.get_index_space_shape.
-    """
-    @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape}
-        compilerbarrier(:const, zero(Int32))
-    end
-end
+@intrinsic get_index_space_shape(pv, axis)
+tfunc(𝕃, ::typeof(Intrinsics.get_index_space_shape), @nospecialize(pv), @nospecialize(axis)) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -69,23 +60,9 @@ end
 # TODO: cuda_tile.get_tensor_shape
 # cuda_tile.load_view_tko
-@eval Intrinsics begin
-    """
-    load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile
-
-    Load a tile from a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.load_view_tko.
-    """
-    @noinline function load_partition_view(pv::PartitionView{T, N, Shape},
-                                           latency::Union{Int, Nothing},
-                                           allow_tma::Bool,
-                                           indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        compilerbarrier(:type, nothing)
-    end
-end
-function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    pv_type = CC.widenconst(argtypes[2])
+@intrinsic load_partition_view(pv, latency, allow_tma, indices)
+function tfunc(𝕃, ::typeof(Intrinsics.load_partition_view), @nospecialize(pv), @nospecialize args...)
+    pv_type = CC.widenconst(pv)
     pv_type <: PartitionView || return nothing
     pv_type isa DataType || return nothing
     length(pv_type.parameters) >= 3 || return nothing
@@ -172,24 +149,10 @@ function pad_indices(ctx::CGCtx, index_vals::Vector{Value}, ndim::Int, idx_type:
 end
 # cuda_tile.make_partition_view
-@eval Intrinsics begin
-    """
-    make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView
-
-    Create a PartitionView from a TensorView with the given tile shape.
-    The `order` parameter (NTuple{N,Int} or nothing) specifies
-    the logical-to-physical dimension mapping (1-indexed), or identity if nothing.
-    Compiled to cuda_tile.make_partition_view.
-    """
-    @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M}
-        compilerbarrier(:type, nothing)
-    end
-end
-function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tv_type = CC.widenconst(argtypes[2])
+@intrinsic make_partition_view(tv, shape, padding_mode, order)
+function tfunc(𝕃, ::typeof(Intrinsics.make_partition_view), @nospecialize(tv), @nospecialize(shape_arg), @nospecialize args...)
+    tv_type = CC.widenconst(tv)
     tv_type <: TensorView || return nothing
-    shape_arg = argtypes[3]
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tv_type)
@@ -336,16 +299,11 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I
 end
 # cuda_tile.make_tensor_view
-@eval Intrinsics begin
-    """
-    make_tensor_view(arr::TileArray) -> TensorView
-
-    Create a TensorView from a TileArray.
-    Compiled to cuda_tile.make_tensor_view.
-    """
-    @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
-        TensorView{T, N}()
-    end
+@intrinsic make_tensor_view(arr::TileArray{T, N}) where {T, N}
+function tfunc(𝕃, ::typeof(Intrinsics.make_tensor_view), @nospecialize(arr))
+    t = CC.widenconst(arr)
+    t <: TileArray || return nothing
+    TensorView{eltype(t), ndims(t)}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args)
     array_arg = args[1]
@@ -366,22 +324,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args
 end
 # cuda_tile.store_view_tko
-@eval Intrinsics begin
-    """
-    store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing
-
-    Store a tile to a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.store_view_tko.
-    """
-    @noinline function store_partition_view(pv::PartitionView{T, N, Shape},
-                                            tile::Tile{T},
-                                            latency::Union{Int, Nothing},
-                                            allow_tma::Bool,
-                                            indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        donotdelete()
-        nothing
-    end
-end
+@intrinsic store_partition_view(pv::PartitionView{T, N, Shape},
+                                tile::Tile{T},
+                                latency::Union{Int, Nothing},
+                                allow_tma::Bool,
+                                indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
+tfunc(𝕃, ::typeof(Intrinsics.store_partition_view), @nospecialize args...) = Nothing
+efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/test/Project.toml b/test/Project.toml
index 278b9d8..bd30c97 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,9 +8,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-[sources]
-FileCheck = {url = "https://github.com/JuliaLLVM/FileCheck.jl", rev = "main"}
-
 [compat]
 FileCheck = "1.0"
 ParallelTestRunner = "2.0"
diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index 2e0cb80..a1503dc 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -688,6 +688,52 @@ end
 end
 end
 end
+
+@testset "float constant addition folds through addf" begin
+    @test @filecheck begin
+        @check_label "entry"
+        @check_not "addf"
+        @check "constant