From 0ec8b9cf7513c4c5d5167b9fafef20d83f3cb866 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 9 Feb 2026 16:18:44 +0100 Subject: [PATCH 01/17] Emit our intrinsics as :calls instead of :invoke. That avoids CIs, allowing easier spoofing of effects through efuncs. --- src/compiler/codegen/expressions.jl | 4 +-- src/compiler/interface.jl | 44 ++++++++++++++++++++--------- src/compiler/intrinsics.jl | 11 +------- src/compiler/intrinsics/atomics.jl | 9 ++++-- src/compiler/intrinsics/memory.jl | 3 +- src/compiler/intrinsics/misc.jl | 3 +- src/compiler/intrinsics/views.jl | 3 +- 7 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl index 02b7c38..9abe616 100644 --- a/src/compiler/codegen/expressions.jl +++ b/src/compiler/codegen/expressions.jl @@ -79,9 +79,7 @@ function emit_call!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) func = get_constant(ctx, args[1]) call_args = args[2:end] - # TODO: This is normally dynamic dispatch, which we should allow. - # However, we currently trigger this when emitting Julia intrinsics. - # We should switch to our own intrinsics entirely, which are only invoked. + # We enter here for dynamic dispatch, but also for all intrinsic functions. @static if isdefined(Core, :throw_methoderror) if func === Core.throw_methoderror diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index bdc6e8a..ebc5607 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -74,15 +74,21 @@ CC.may_compress(::cuTileInterpreter) = true CC.may_discard_trees(::cuTileInterpreter) = false #============================================================================= - Custom return-type inference (tfuncs) for intrinsics + Custom inference for intrinsics =============================================================================# -# Per-intrinsic return type overrides using multiple dispatch. +# Per-intrinsic return type overrides. # Returns nothing when no override applies (fallback). -# Concrete per-intrinsic methods are defined in intrinsics/ (after the -# Intrinsics module exists). tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing +# Per-intrinsic effect overrides. +# Returns nothing when no override applies (fallback). +efunc(@nospecialize(f), effects::CC.Effects) = nothing + +# Predicate for functions defined in the Intrinsics module. +# These get NoCallInfo() so they stay as Expr(:call) rather than Expr(:invoke). +isintrinsic(@nospecialize(f)) = isa(f, Function) && parentmodule(f) === Intrinsics + #============================================================================= Subprogram inference for reduce/scan =============================================================================# @@ -172,9 +178,10 @@ end result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing}, sv::CC.InferenceState, max_methods::Int) + is_intr = isintrinsic(f) rt_override = tfunc(f, arginfo.argtypes) subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interp′, sv′) isready(result) || return false @@ -182,8 +189,11 @@ end cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -195,9 +205,10 @@ elseif isdefined(CC, :Future) # 1.12–1.13 result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) + is_intr = isintrinsic(f) rt_override = tfunc(f, arginfo.argtypes) subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interp′, sv′) isready(result) || return false @@ -205,8 +216,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -219,10 +233,14 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) _infer_subprogram(interp, f, arginfo, si, nothing, sv) # side-effect only + is_intr = isintrinsic(f) rt_override = tfunc(f, arginfo.argtypes) - if rt_override !== nothing - return CC.CallMeta(rt_override, result.exct, result.effects, - result.info) + rt = rt_override !== nothing ? rt_override : result.rt + efunc_override = is_intr ? efunc(f, result.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : result.effects + info = is_intr ? CC.NoCallInfo() : result.info + if is_intr || rt_override !== nothing + return CC.CallMeta(rt, result.exct, effects, info) end return result end diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index aa0d425..1aa42d5 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -4,7 +4,7 @@ module Intrinsics -using Base: compilerbarrier, donotdelete +using Base: compilerbarrier using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual @@ -19,15 +19,6 @@ end # Sometimes that's not possible, e.g., because the functionality required for that is # overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those # intrinsics we disable constant folding using a `compilerbarrier(:const)` -# -# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their -# bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin -# with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body. -# `@assume_effects !:effect_free` does NOT work — `override_effects` can only strengthen -# effects (set ALWAYS_TRUE), not weaken them. Spoofing `ipo_effects` via a custom -# `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting -# `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct -# approach. emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 3c89bd4..faabebf 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -41,10 +41,11 @@ end """ @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() compilerbarrier(:const, zero(T))::T end end +efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) cb = ctx.cb tt = ctx.tt @@ -179,10 +180,11 @@ end """ @noinline function atomic_xchg(array::TileArray{T, N}, index, val, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() compilerbarrier(:const, zero(T)) end end +efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) emit_atomic_rmw!(ctx, args, AtomicXCHG) end @@ -198,10 +200,11 @@ end """ @noinline function atomic_add(array::TileArray{T, N}, index, val, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() compilerbarrier(:const, zero(T)) end end +efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl index 1d42ad5..4db4b46 100644 --- a/src/compiler/intrinsics/memory.jl +++ b/src/compiler/intrinsics/memory.jl @@ -95,10 +95,11 @@ end @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, latency::Union{Int, Nothing}, mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} - donotdelete() nothing end end +efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl index 0b9f332..19a8534 100644 --- a/src/compiler/intrinsics/misc.jl +++ b/src/compiler/intrinsics/misc.jl @@ -3,10 +3,11 @@ # cuda_tile.assert @eval Intrinsics begin @noinline function assert(cond::Bool, message::String) - donotdelete(cond, message) nothing end end +efunc(::typeof(Intrinsics.assert), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args) cond = @something emit_value!(ctx, args[1]) throw(IRError("assert: cannot resolve condition")) message = @something get_constant(ctx, args[2]) throw(IRError("assert: requires constant message")) diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl index c8f1a88..1c6e7c6 100644 --- a/src/compiler/intrinsics/views.jl +++ b/src/compiler/intrinsics/views.jl @@ -378,10 +378,11 @@ end latency::Union{Int, Nothing}, allow_tma::Bool, indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - donotdelete() nothing end end +efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args) cb = ctx.cb tt = ctx.tt From 23c37db9462f2581cc9f0896bca16f49cc87c549 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 9 Feb 2026 17:08:32 +0100 Subject: [PATCH 02/17] Add const-prop tests. --- test/codegen/integration.jl | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl index 2e0cb80..a1503dc 100644 --- a/test/codegen/integration.jl +++ b/test/codegen/integration.jl @@ -688,6 +688,52 @@ end end end end + + @testset "float constant addition folds through addf" begin + @test @filecheck begin + @check_label "entry" + @check_not "addf" + @check "constant Date: Mon, 9 Feb 2026 21:04:37 +0100 Subject: [PATCH 03/17] Simplify intrinsic definitions. --- Project.toml | 2 + src/compiler/interface.jl | 19 ++ src/compiler/intrinsics.jl | 42 +++- src/compiler/intrinsics/arithmetic.jl | 271 ++++++++++++++----------- src/compiler/intrinsics/atomics.jl | 47 +---- src/compiler/intrinsics/conversions.jl | 28 +-- src/compiler/intrinsics/core.jl | 209 ++++--------------- src/compiler/intrinsics/math.jl | 131 ++++-------- src/compiler/intrinsics/memory.jl | 47 ++--- src/compiler/intrinsics/misc.jl | 6 +- src/compiler/intrinsics/views.jl | 74 ++----- 11 files changed, 322 insertions(+), 554 deletions(-) diff --git a/Project.toml b/Project.toml index dd1c4ea..cdff353 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8" CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d" +ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" [weakdeps] @@ -31,4 +32,5 @@ BFloat16s = "0.6" CompilerCaching = "0.1" CUDA_Compiler_jll = "0.4" CUDA_Tile_jll = "13.1" +ExprTools = "0.1" IRStructurizer = "0.1" diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index ebc5607..9080e87 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -191,6 +191,11 @@ end rt = rt_override !== nothing ? rt_override : cm.rt efunc_override = is_intr ? efunc(f, cm.effects) : nothing effects = efunc_override !== nothing ? efunc_override : cm.effects + # Mark intrinsics as non-consistently-overlayed so callers can't be + # concrete-eval'd (not_callable() bodies would throw at runtime). + if is_intr + effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) + end info = is_intr ? CC.NoCallInfo() : cm.info info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) @@ -218,6 +223,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 rt = rt_override !== nothing ? rt_override : cm.rt efunc_override = is_intr ? efunc(f, cm.effects) : nothing effects = efunc_override !== nothing ? efunc_override : cm.effects + # Mark intrinsics as non-consistently-overlayed so callers can't be + # concrete-eval'd (not_callable() bodies would throw at runtime). + if is_intr + effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) + end info = is_intr ? CC.NoCallInfo() : cm.info info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) @@ -238,6 +248,11 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges rt = rt_override !== nothing ? rt_override : result.rt efunc_override = is_intr ? efunc(f, result.effects) : nothing effects = efunc_override !== nothing ? efunc_override : result.effects + # Mark intrinsics as non-consistently-overlayed so callers can't be + # concrete-eval'd (not_callable() bodies would throw at runtime). + if is_intr + effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) + end info = is_intr ? CC.NoCallInfo() : result.info if is_intr || rt_override !== nothing return CC.CallMeta(rt, result.exct, effects, info) @@ -247,6 +262,7 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges end # Disable semi-concrete interpretation (broken with overlays per JuliaLang/julia#47349) +# and block concrete eval for intrinsics (not_callable() bodies return dummy values). function CC.concrete_eval_eligible(interp::cuTileInterpreter, @nospecialize(f), result::CC.MethodCallResult, arginfo::CC.ArgInfo, sv::CC.InferenceState) ret = @invoke CC.concrete_eval_eligible(interp::CC.AbstractInterpreter, @@ -254,6 +270,9 @@ function CC.concrete_eval_eligible(interp::cuTileInterpreter, if ret === :semi_concrete_eval return :none end + if ret === :concrete_eval && isintrinsic(f) + return :none + end return ret end diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 1aa42d5..4cb6d0a 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -12,13 +12,41 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end -# NOTE: Due to JuliaLang/julia#60583, intrinsics may be called during constant evaluation. -# Because of that, such intrinsics (such as basic arithmetic) need to provide an -# implementation that actually computes a valid result using Julia intrinsics. -# -# Sometimes that's not possible, e.g., because the functionality required for that is -# overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those -# intrinsics we disable constant folding using a `compilerbarrier(:const)` +# NOTE: Intrinsics are never directly folded (concrete_eval_eligible returns :none, +# nonoverlayed=ALWAYS_FALSE taints caller effects). However, overlay callers +# with @assume_effects :foldable override the propagated effects, causing the +# compiler to concrete-evaluate through intrinsic bodies (JuliaLang/julia#60583). +# Intrinsics on such paths need callable bodies (function definition form). +# All others use compilerbarrier(:type, nothing) as a dummy body (bare signature). + +using ExprTools: splitdef, combinedef + +""" + @intrinsic signature + @intrinsic function_definition + +Define a Tile IR intrinsic in the `Intrinsics` module. + +A bare signature (e.g. `@intrinsic foo(x)`) creates a dummy body using +`compilerbarrier(:type, nothing)` so body inference returns `Any`. Actual +return types come from `tfunc` overrides in the interpreter. + +A function definition (e.g. `@intrinsic foo(x) = expr`) preserves the body, +providing a callable implementation for concrete evaluation. This is needed +when overlay callers with `@assume_effects :foldable` cause the compiler to +evaluate through intrinsic bodies (JuliaLang/julia#60583). The body should +provide a correct scalar implementation using `Core.Intrinsics`, or return +`nothing` for side-effect-only intrinsics. +""" +macro intrinsic(ex) + if ex isa Expr && ex.head in (:function, :(=)) + funcdef = combinedef(splitdef(ex)) + else + funcdef = Expr(:function, ex, quote compilerbarrier(:type, nothing) end) + end + funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef) + return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) +end emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 6272251..861731b 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -84,53 +84,60 @@ end ## Integer arithmetic # cuda_tile.absi -@eval Intrinsics begin - """Integer absolute value. Compiled to cuda_tile.absi.""" - @noinline absi(x::T) where {T<:Integer} = - ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x) - @noinline absi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) +@intrinsic absi(x::T) where {T<:Integer} = + ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x) +@intrinsic absi(a::Tile) +function tfunc(::typeof(Intrinsics.absi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args) emit_unop!(ctx, args, encode_AbsIOp!) end # cuda_tile.addi -@eval Intrinsics begin - @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) - @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() +@intrinsic addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) +@intrinsic addi(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.addi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) emit_binop!(ctx, args, encode_AddIOp!) end # cuda_tile.cldi (ceiling division, toward positive infinity) -@eval Intrinsics begin - @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic cldi(x, y, s) +tfunc(::typeof(Intrinsics.cldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf) end # cuda_tile.cmpi -@eval Intrinsics begin - @noinline function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} - if pred === CmpLessThan - s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - elseif pred === CmpLessThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y) - elseif pred === CmpGreaterThan - s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x) - elseif pred === CmpGreaterThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_int(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_int(x, y) - end +@intrinsic function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} + if pred === CmpLessThan + s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) + elseif pred === CmpLessThanOrEqual + s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y) + elseif pred === CmpGreaterThan + s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x) + elseif pred === CmpGreaterThanOrEqual + s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x) + elseif pred === CmpEqual + Core.Intrinsics.eq_int(x, y) + else # CmpNotEqual + Core.Intrinsics.ne_int(x, y) end - @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} = - Tile{Bool, S}() +end +@intrinsic cmpi(a::Tile, b::Tile, pred, s) +function tfunc(::typeof(Intrinsics.cmpi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} + end + return nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) cb = ctx.cb @@ -156,10 +163,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) end # cuda_tile.divi (truncating division, toward zero) -@eval Intrinsics begin - @noinline function divi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) - end +@intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer} + s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness")) @@ -167,22 +172,22 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) end # cuda_tile.fldi (floor division, toward negative infinity) -@eval Intrinsics begin - @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic fldi(x, y, s) +tfunc(::typeof(Intrinsics.fldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf) end # cuda_tile.maxi -@eval Intrinsics begin - @noinline function maxi(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, y, x) - end - @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() +@intrinsic function maxi(x::T, y::T, s::Signedness) where {T<:Integer} + lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) + ifelse(lt, y, x) +end +@intrinsic maxi(a::Tile, b::Tile, s) +function tfunc(::typeof(Intrinsics.maxi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) @@ -190,13 +195,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) end # cuda_tile.mini -@eval Intrinsics begin - @noinline function mini(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, x, y) - end - @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() +@intrinsic function mini(x::T, y::T, s::Signedness) where {T<:Integer} + lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) + ifelse(lt, x, y) +end +@intrinsic mini(a::Tile, b::Tile, s) +function tfunc(::typeof(Intrinsics.mini), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) @@ -204,40 +210,43 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) end # cuda_tile.muli -@eval Intrinsics begin - @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) - @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() +@intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) +@intrinsic muli(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.muli), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) end # cuda_tile.mulhii -@eval Intrinsics begin - """High bits of integer multiply (for extended precision arithmetic). Compiled to cuda_tile.mulhii.""" - @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} - ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T - end - @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}() +@intrinsic function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} + ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T +end +@intrinsic mulhii(a::Tile, b::Tile, s) +function tfunc(::typeof(Intrinsics.mulhii), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) end # cuda_tile.negi -@eval Intrinsics begin - @noinline negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x) - @noinline negi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) +@intrinsic negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x) +@intrinsic negi(a::Tile) +function tfunc(::typeof(Intrinsics.negi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args) emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone) end # cuda_tile.remi -@eval Intrinsics begin - @noinline function remi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y) - end +@intrinsic function remi(x::T, y::T, s::Signedness) where {T<:Integer} + s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness")) @@ -245,18 +254,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) end # cuda_tile.shli -@eval Intrinsics begin - @noinline shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) -end +@intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) emit_binop!(ctx, args, encode_ShLIOp!) end # cuda_tile.shri -@eval Intrinsics begin - @noinline function shri(x::T, y::Integer, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T) - end +@intrinsic function shri(x::T, y::Integer, s::Signedness) where {T<:Integer} + s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness")) @@ -264,9 +269,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) end # cuda_tile.subi -@eval Intrinsics begin - @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) - @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() +@intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) +@intrinsic subi(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.subi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) @@ -276,42 +283,51 @@ end ## Floating-point arithmetic # cuda_tile.absf -@eval Intrinsics begin - @noinline absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x) - @noinline absf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) +@intrinsic absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x) +@intrinsic absf(a::Tile) +function tfunc(::typeof(Intrinsics.absf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args) emit_unop!(ctx, args, encode_AbsFOp!) end # cuda_tile.addf -@eval Intrinsics begin - @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) - @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() +@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) +@intrinsic addf(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.addf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) end # cuda_tile.cmpf -@eval Intrinsics begin - @noinline function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} - if pred === CmpLessThan - Core.Intrinsics.lt_float(x, y) - elseif pred === CmpLessThanOrEqual - Core.Intrinsics.le_float(x, y) - elseif pred === CmpGreaterThan - Core.Intrinsics.lt_float(y, x) - elseif pred === CmpGreaterThanOrEqual - Core.Intrinsics.le_float(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_float(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_float(x, y) - end +@intrinsic function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} + if pred === CmpLessThan + Core.Intrinsics.lt_float(x, y) + elseif pred === CmpLessThanOrEqual + Core.Intrinsics.le_float(x, y) + elseif pred === CmpGreaterThan + Core.Intrinsics.lt_float(y, x) + elseif pred === CmpGreaterThanOrEqual + Core.Intrinsics.le_float(y, x) + elseif pred === CmpEqual + Core.Intrinsics.eq_float(x, y) + else # CmpNotEqual + Core.Intrinsics.ne_float(x, y) + end +end +@intrinsic cmpf(a::Tile, b::Tile, pred) +function tfunc(::typeof(Intrinsics.cmpf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} end - @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} = - Tile{Bool, S}() + return nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) cb = ctx.cb @@ -336,36 +352,44 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) end # cuda_tile.divf -@eval Intrinsics begin - @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) - @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() +@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) +@intrinsic divf(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.divf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) end # cuda_tile.mulf -@eval Intrinsics begin - @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) - @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() +@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) +@intrinsic mulf(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.mulf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) end # cuda_tile.negf -@eval Intrinsics begin - @noinline negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x) - @noinline negf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) +@intrinsic negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x) +@intrinsic negf(a::Tile) +function tfunc(::typeof(Intrinsics.negf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args) emit_unop!(ctx, args, encode_NegFOp!) end # cuda_tile.subf -@eval Intrinsics begin - @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) - @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() +@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) +@intrinsic subf(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.subf), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) @@ -375,10 +399,11 @@ end ## Boolean arithmetic # cuda_tile.andi -@eval Intrinsics begin - @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) - """Element-wise logical AND for boolean tiles.""" - @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) +@intrinsic andi(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.andi), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb @@ -396,10 +421,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) end # cuda_tile.ori -@eval Intrinsics begin - @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) - """Element-wise logical OR for boolean tiles.""" - @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) +@intrinsic ori(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.ori), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb @@ -417,10 +443,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) end # cuda_tile.xori -@eval Intrinsics begin - @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) - """Element-wise logical XOR for boolean tiles.""" - @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) +@intrinsic xori(a::Tile, b::Tile) +function tfunc(::typeof(Intrinsics.xori), argtypes::Vector{Any}) + t = CC.widenconst(argtypes[2]) + t <: Tile ? t : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index faabebf..79258fa 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -31,19 +31,9 @@ function memory_scope_to_scope(scope::Int) end # cuda_tile.atomic_cas_tko -@eval Intrinsics begin - """ - atomic_cas(array, index, expected, desired, memory_order, memory_scope) - - Atomic compare-and-swap at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_cas_tko. - """ - @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired, - memory_order::Int, memory_scope::Int) where {T, N} - compilerbarrier(:const, zero(T))::T - end -end +@intrinsic atomic_cas(array, index, expected, desired, + memory_order, memory_scope) +tfunc(::typeof(Intrinsics.atomic_cas), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) @@ -170,19 +160,8 @@ function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) end # cuda_tile.atomic_rmw_tko with XCHG -@eval Intrinsics begin - """ - atomic_xchg(array, index, val, memory_order, memory_scope) - - Atomic exchange at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with XCHG. - """ - @noinline function atomic_xchg(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_xchg(array, index, val, memory_order, memory_scope) +tfunc(::typeof(Intrinsics.atomic_xchg), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) @@ -190,19 +169,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) end # cuda_tile.atomic_rmw_tko with ADD -@eval Intrinsics begin - """ - atomic_add(array, index, val, memory_order, memory_scope) - - Atomic addition at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with ADD. - """ - @noinline function atomic_add(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_add(array, index, val, + memory_order, memory_scope) +tfunc(::typeof(Intrinsics.atomic_add), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 6c33afc..6aa879f 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -3,10 +3,8 @@ # TODO: cuda_tile.bitcast # cuda_tile.exti (scalar integer extension) -@eval Intrinsics begin - @noinline function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} - s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) - end +@intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} + s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) cb = ctx.cb @@ -26,10 +24,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) end # cuda_tile.ftof (scalar float to float) -@eval Intrinsics begin - @noinline function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} - sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x) - end +@intrinsic function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} + sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) cb = ctx.cb @@ -48,10 +44,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) end # cuda_tile.ftoi (scalar float to integer) -@eval Intrinsics begin - @noinline function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} - s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x) - end +@intrinsic function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} + s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) cb = ctx.cb @@ -71,10 +65,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) end # cuda_tile.itof (scalar integer to float) -@eval Intrinsics begin - @noinline function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} - s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x) - end +@intrinsic function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} + s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) cb = ctx.cb @@ -94,9 +86,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) end # cuda_tile.trunci (scalar integer truncation) -@eval Intrinsics begin - @noinline trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x) -end +@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index b64fbcf..abe3f34 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -19,17 +19,7 @@ function validate_tile_shape(shape, context::String) end # cuda_tile.broadcast -@eval Intrinsics begin - """ - broadcast(tile, shape_val) - - Explicitly broadcast a tile to a target shape. - Compiled to cuda_tile.broadcast. - """ - @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end +@intrinsic broadcast(tile, shape) function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tile_type = CC.widenconst(argtypes[2]) @@ -109,17 +99,7 @@ function broadcast_tile_to_shape!(cb::CodeBuilder, tt::TypeTable, tv::CGVal, end # cuda_tile.cat -@eval Intrinsics begin - """ - cat(tiles, axis_val) - - Concatenate two tiles along 0-indexed axis. - Compiled to cuda_tile.cat. - """ - @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2} - compilerbarrier(:type, nothing) - end -end +@intrinsic cat(tiles, axis) function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tuple_type = CC.widenconst(argtypes[2]) @@ -186,17 +166,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args) end # cuda_tile.constant -@eval Intrinsics begin - """ - constant(shape, value, T) - - Create a tile filled with a constant value. - Compiled to cuda_tile.constant. - """ - @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T} - compilerbarrier(:type, nothing) - end -end +@intrinsic constant(shape, value, T) function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any}) length(argtypes) >= 4 || return nothing shape_arg = argtypes[2] @@ -236,17 +206,7 @@ end # TODO: cuda_tile.entry # cuda_tile.extract -@eval Intrinsics begin - """ - extract(tile, index_val, shape_val) - - Extract a sub-tile from tile at 0-indexed slice indices. - Compiled to cuda_tile.extract. - """ - @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end +@intrinsic extract(tile, index, shape) function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any}) length(argtypes) >= 4 || return nothing tile_type = CC.widenconst(argtypes[2]) @@ -300,15 +260,8 @@ end # TODO: cuda_tile.get_global # cuda_tile.get_num_tile_blocks -@eval Intrinsics begin - """ - get_num_tile_blocks(axis)::Int32 - - Get the grid size along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_num_tile_blocks. - """ - @noinline get_num_tile_blocks(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_num_tile_blocks(axis) +tfunc(::typeof(Intrinsics.get_num_tile_blocks), argtypes::Vector{Any}) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis")) @@ -320,15 +273,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), a end # cuda_tile.get_tile_block_id -@eval Intrinsics begin - """ - get_tile_block_id(axis)::Int32 - - Get the block ID along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_tile_block_id. - """ - @noinline get_tile_block_id(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_tile_block_id(axis) +tfunc(::typeof(Intrinsics.get_tile_block_id), argtypes::Vector{Any}) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis")) @@ -343,17 +289,7 @@ end # TODO: cuda_tile.global # cuda_tile.iota -@eval Intrinsics begin - """ - iota(shape, T) - - Create a 1D tile with values [0, 1, 2, ..., shape[1]-1] (0-indexed). - Compiled to cuda_tile.iota. - """ - @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T} - compilerbarrier(:type, nothing) - end -end +@intrinsic iota(shape, T) function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing shape_arg = argtypes[2] @@ -387,17 +323,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) end # cuda_tile.mmaf, cuda_tile.mmai -@eval Intrinsics begin - """ - mma(a, b, acc) - - Matrix-multiply-accumulate: result = a @ b + acc. - Compiled to cuda_tile.mmaf or cuda_tile.mmai. - """ - @noinline function mma(a::Tile{T1}, b::Tile{T2}, acc::Tile{T3, SC}) where {T1, T2, T3, SC} - Tile{T3, SC}() - end -end +@intrinsic mma(a, b, acc) +tfunc(::typeof(Intrinsics.mma), argtypes::Vector{Any}) = CC.widenconst(argtypes[4]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args) cb = ctx.cb @@ -415,16 +342,16 @@ end # TODO: cuda_tile.module # cuda_tile.offset -@eval Intrinsics begin - """ - offset(base, offsets) - - Compute base_ptr + offsets for each element of offsets tile (element-scaled). - Returns a tile of pointers. Compiled to cuda_tile.offset. - """ - @noinline function offset(base::Ptr{T}, offsets::Tile{I, S}) where {T, I <: Integer, S} - Tile{Ptr{T}, S}() - end +@intrinsic offset(base, offsets) +function tfunc(::typeof(Intrinsics.offset), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + base_type = CC.widenconst(argtypes[2]) + base_type <: Ptr || return nothing + offsets_type = CC.widenconst(argtypes[3]) + offsets_type <: Tile || return nothing + T = eltype(base_type) + S = offsets_type.parameters[2] + return Tile{Ptr{T}, S} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.offset), args) cb = ctx.cb @@ -469,17 +396,7 @@ end # TODO: cudatile.pack # cuda_tile.permute -@eval Intrinsics begin - """ - permute(tile, perm_val) - - Permute tile dimensions according to 0-indexed permutation. - Compiled to cuda_tile.permute. - """ - @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N} - compilerbarrier(:type, nothing) - end -end +@intrinsic permute(tile, perm) function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tile_type = CC.widenconst(argtypes[2]) @@ -529,17 +446,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args) end # cuda_tile.transpose -@eval Intrinsics begin - """ - transpose(tile) - - Transpose a 2D tile, swapping its dimensions. - Compiled to cuda_tile.permute with perm=(1, 0). - """ - @noinline function transpose(tile::Tile{T}) where {T} - compilerbarrier(:type, nothing) - end -end +@intrinsic transpose(tile) function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any}) length(argtypes) >= 2 || return nothing tile_type = CC.widenconst(argtypes[2]) @@ -576,24 +483,7 @@ end # cuda_tile.reduce -@eval Intrinsics begin - """ - reduce(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple) -> Tuple{Tile...} - - Reduce tiles along a 0-indexed axis using combiner `f` with per-operand - identity values. Accepts and returns tuples of tiles; single-operand - callers wrap in 1-tuples and unwrap with `[1]`. - Compiled to cuda_tile.reduce. - """ - @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}) where {T, S} - compilerbarrier(:type, nothing) - end - @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f, - identities::Tuple{Any, Any}) where {T1, T2, S} - compilerbarrier(:type, nothing) - end -end +@intrinsic reduce(tiles, axis, f, identities) function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tuple_type = CC.widenconst(argtypes[2]) @@ -724,17 +614,7 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer = IntegerIdentityVal(to_uint128(T(val)), dtype, T) # cuda_tile.reshape -@eval Intrinsics begin - """ - reshape(tile, shape_val) - - Reshape a tile to a new shape (same total elements). - Compiled to cuda_tile.reshape. - """ - @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end +@intrinsic reshape(tile, shape) function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tile_type = CC.widenconst(argtypes[2]) @@ -803,21 +683,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) end # cuda_tile.scan -@eval Intrinsics begin - """ - scan(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple, reverse=false) -> Tuple{Tile...} - - Parallel prefix scan along a 0-indexed axis using combiner `f` with - per-operand identity values. Accepts and returns tuples of tiles; - single-operand callers wrap in 1-tuples and unwrap with `[1]`. - `reverse=true` for a reverse (suffix) scan. - Compiled to cuda_tile.scan. - """ - @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}, reverse::Bool=false) where {T, S} - compilerbarrier(:type, nothing) - end -end +@intrinsic scan(tiles, axis, f, identities, reverse=false) function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any}) length(argtypes) >= 2 || return nothing tuple_type = CC.widenconst(argtypes[2]) @@ -916,17 +782,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) end # cuda_tile.select -@eval Intrinsics begin - """ - select(cond, x, y) - - Element-wise conditional selection. - Compiled to cuda_tile.select. - """ - @noinline select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) - @noinline function select(cond::Tile{Bool, S}, x::Tile{T, S}, y::Tile{T, S}) where {T, S} - Tile{T, S}() - end +@intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) +@intrinsic select(cond::Tile, x, y) +function tfunc(::typeof(Intrinsics.select), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + cond_type = CC.widenconst(argtypes[2]) + cond_type <: Tile ? CC.widenconst(argtypes[3]) : nothing end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) cb = ctx.cb @@ -947,10 +808,8 @@ end # These are codegen-only reinterpret intrinsics for map(f, tile). # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped. # from_scalar: restores jltype to Tile{T, S}. -@eval Intrinsics begin - @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:type, nothing) - @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}() -end +@intrinsic to_scalar(tile) +@intrinsic from_scalar(x, S) function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing T = CC.widenconst(argtypes[2]) diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index ded13df..8bd3e93 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -3,41 +3,29 @@ ## Floating-point math # cuda_tile.ceil -@eval Intrinsics begin - """Ceiling (round toward positive infinity). Compiled to cuda_tile.ceil.""" - @noinline ceil(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline ceil(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic ceil(x) +tfunc(::typeof(Intrinsics.ceil), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args) emit_unop!(ctx, args, encode_CeilOp!) end # cuda_tile.cos -@eval Intrinsics begin - """Cosine. Compiled to cuda_tile.cos.""" - @noinline cos(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cos(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cos(x) +tfunc(::typeof(Intrinsics.cos), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args) emit_unop!(ctx, args, encode_CosOp!) end # cuda_tile.cosh -@eval Intrinsics begin - """Hyperbolic cosine. Compiled to cuda_tile.cosh.""" - @noinline cosh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cosh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cosh(x) +tfunc(::typeof(Intrinsics.cosh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args) emit_unop!(ctx, args, encode_CosHOp!) end # cuda_tile.exp2 -@eval Intrinsics begin - """Base-2 exponential (2^x). Compiled to cuda_tile.exp2.""" - @noinline exp2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp2(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp2(x, flush_to_zero=false) +tfunc(::typeof(Intrinsics.exp2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) cb = ctx.cb @@ -52,11 +40,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) end # cuda_tile.exp -@eval Intrinsics begin - """Natural exponential (e^x). Compiled to cuda_tile.exp.""" - @noinline exp(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp(x) +tfunc(::typeof(Intrinsics.exp), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) cb = ctx.cb @@ -69,21 +54,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) end # cuda_tile.floor -@eval Intrinsics begin - """Floor (round toward negative infinity). Compiled to cuda_tile.floor.""" - @noinline floor(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline floor(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic floor(x) +tfunc(::typeof(Intrinsics.floor), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args) emit_unop!(ctx, args, encode_FloorOp!) end # cuda_tile.fma -@eval Intrinsics begin - """Fused multiply-add: a * b + c. Compiled to cuda_tile.fma.""" - @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic fma(x, y, z) +tfunc(::typeof(Intrinsics.fma), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -99,11 +78,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) end # cuda_tile.log2 -@eval Intrinsics begin - """Base-2 logarithm. Compiled to cuda_tile.log2.""" - @noinline log2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log2(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log2(x) +tfunc(::typeof(Intrinsics.log2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) cb = ctx.cb @@ -116,11 +92,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) end # cuda_tile.log -@eval Intrinsics begin - """Element-wise natural logarithm. Compiled to cuda_tile.log.""" - @noinline log(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log(x) +tfunc(::typeof(Intrinsics.log), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) cb = ctx.cb @@ -133,49 +106,36 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) end # cuda_tile.maxf -@eval Intrinsics begin - @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y) - @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic maxf(x, y) +tfunc(::typeof(Intrinsics.maxf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) end # cuda_tile.minf -@eval Intrinsics begin - @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y) - @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic minf(x, y) +tfunc(::typeof(Intrinsics.minf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) end # cuda_tile.pow -@eval Intrinsics begin - """Element-wise power. Compiled to cuda_tile.pow.""" - @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic pow(x, y) +tfunc(::typeof(Intrinsics.pow), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) end # cuda_tile.remf -@eval Intrinsics begin - """Element-wise floating-point remainder. Compiled to cuda_tile.remf.""" - @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic remf(x, y) +tfunc(::typeof(Intrinsics.remf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) end # cuda_tile.rsqrt -@eval Intrinsics begin - """Element-wise reciprocal square root. Compiled to cuda_tile.rsqrt.""" - @noinline rsqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline rsqrt(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic rsqrt(x, flush_to_zero=false) +tfunc(::typeof(Intrinsics.rsqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args) cb = ctx.cb @@ -190,31 +150,22 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args) end # cuda_tile.sin -@eval Intrinsics begin - """Element-wise sine. Compiled to cuda_tile.sin.""" - @noinline sin(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline sin(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic sin(x) +tfunc(::typeof(Intrinsics.sin), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args) emit_unop!(ctx, args, encode_SinOp!) end # cuda_tile.sinh -@eval Intrinsics begin - """Element-wise hyperbolic sine. Compiled to cuda_tile.sinh.""" - @noinline sinh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline sinh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic sinh(x) +tfunc(::typeof(Intrinsics.sinh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args) emit_unop!(ctx, args, encode_SinHOp!) end # cuda_tile.sqrt -@eval Intrinsics begin - """Element-wise square root. Compiled to cuda_tile.sqrt.""" - @noinline sqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline sqrt(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic sqrt(x) +tfunc(::typeof(Intrinsics.sqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args) cb = ctx.cb @@ -227,21 +178,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args) end # cuda_tile.tan -@eval Intrinsics begin - """Element-wise tangent. Compiled to cuda_tile.tan.""" - @noinline tan(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline tan(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic tan(x) +tfunc(::typeof(Intrinsics.tan), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args) emit_unop!(ctx, args, encode_TanOp!) end # cuda_tile.tanh -@eval Intrinsics begin - """Element-wise hyperbolic tangent. Compiled to cuda_tile.tanh.""" - @noinline tanh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline tanh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic tanh(x) +tfunc(::typeof(Intrinsics.tanh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args) emit_unop!(ctx, args, encode_TanHOp!) end diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl index 4db4b46..f7bf9e5 100644 --- a/src/compiler/intrinsics/memory.jl +++ b/src/compiler/intrinsics/memory.jl @@ -3,23 +3,16 @@ # TODO: cuda_tile.join_tokens # cuda_tile.load_ptr_tko -@eval Intrinsics begin - """ - load_ptr_tko(ptrs, latency, mask=nothing, padding=nothing) - - Load values from a tile of pointers. - If mask is provided, masked-out positions return the padding value. - Compiled to cuda_tile.load_ptr_tko. - - Note: TMA (allow_tma) is not applicable for pointer-based loads as they - support irregular access patterns incompatible with TMA requirements. - """ - @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S}, - latency::Union{Int, Nothing}=nothing, - mask::Union{Tile{Bool, S}, Nothing}=nothing, - padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S} - Tile{T, S}() - end +@intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing) +function tfunc(::typeof(Intrinsics.load_ptr_tko), argtypes::Vector{Any}) + length(argtypes) >= 2 || return nothing + ptrs_type = CC.widenconst(argtypes[2]) + ptrs_type <: Tile || return nothing + ptr_type = eltype(ptrs_type) + ptr_type <: Ptr || return nothing + T = eltype(ptr_type) + S = ptrs_type.parameters[2] + return Tile{T, S} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args) cb = ctx.cb @@ -81,22 +74,10 @@ end # TODO: cuda_tile.make_token # cuda_tile.store_ptr_tko -@eval Intrinsics begin - """ - store_ptr_tko(ptrs, values, latency, mask=nothing) - - Store values to a tile of pointers. - If mask is provided, masked-out positions are not written. - Compiled to cuda_tile.store_ptr_tko. - - Note: TMA (allow_tma) is not applicable for pointer-based stores as they - support irregular access patterns incompatible with TMA requirements. - """ - @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, - latency::Union{Int, Nothing}, - mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} - nothing - end +@intrinsic function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, + latency::Union{Int, Nothing}, + mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} + nothing end efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl index 19a8534..2a0a784 100644 --- a/src/compiler/intrinsics/misc.jl +++ b/src/compiler/intrinsics/misc.jl @@ -1,10 +1,8 @@ # miscellaneous intrinsics # cuda_tile.assert -@eval Intrinsics begin - @noinline function assert(cond::Bool, message::String) - nothing - end +@intrinsic function assert(cond::Bool, message::String) + nothing end efunc(::typeof(Intrinsics.assert), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl index 1c6e7c6..fd1bdde 100644 --- a/src/compiler/intrinsics/views.jl +++ b/src/compiler/intrinsics/views.jl @@ -24,17 +24,8 @@ function get_padding_value(ctx::CGCtx, args) end # cuda_tile.get_index_space_shape -@eval Intrinsics begin - """ - get_index_space_shape(pv::PartitionView, axis) -> Int32 - - Get the number of tiles along the given axis (0-indexed). - Compiled to cuda_tile.get_index_space_shape. - """ - @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape} - compilerbarrier(:const, zero(Int32)) - end -end +@intrinsic get_index_space_shape(pv, axis) +tfunc(::typeof(Intrinsics.get_index_space_shape), argtypes::Vector{Any}) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args) cb = ctx.cb tt = ctx.tt @@ -69,20 +60,7 @@ end # TODO: cuda_tile.get_tensor_shape # cuda_tile.load_view_tko -@eval Intrinsics begin - """ - load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile - - Load a tile from a partition view at the given 0-indexed tile coordinates. - Compiled to cuda_tile.load_view_tko. - """ - @noinline function load_partition_view(pv::PartitionView{T, N, Shape}, - latency::Union{Int, Nothing}, - allow_tma::Bool, - indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - compilerbarrier(:type, nothing) - end -end +@intrinsic load_partition_view(pv, latency, allow_tma, indices) function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any}) length(argtypes) >= 2 || return nothing pv_type = CC.widenconst(argtypes[2]) @@ -172,19 +150,7 @@ function pad_indices(ctx::CGCtx, index_vals::Vector{Value}, ndim::Int, idx_type: end # cuda_tile.make_partition_view -@eval Intrinsics begin - """ - make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView - - Create a PartitionView from a TensorView with the given tile shape. - The `order` parameter (NTuple{N,Int} or nothing) specifies - the logical-to-physical dimension mapping (1-indexed), or identity if nothing. - Compiled to cuda_tile.make_partition_view. - """ - @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M} - compilerbarrier(:type, nothing) - end -end +@intrinsic make_partition_view(tv, shape, padding_mode, order) function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any}) length(argtypes) >= 3 || return nothing tv_type = CC.widenconst(argtypes[2]) @@ -336,16 +302,8 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I end # cuda_tile.make_tensor_view -@eval Intrinsics begin - """ - make_tensor_view(arr::TileArray) -> TensorView - - Create a TensorView from a TileArray. - Compiled to cuda_tile.make_tensor_view. - """ - @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N} - TensorView{T, N}() - end +@intrinsic function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N} + TensorView{T, N}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args) array_arg = args[1] @@ -366,20 +324,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args end # cuda_tile.store_view_tko -@eval Intrinsics begin - """ - store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing - - Store a tile to a partition view at the given 0-indexed tile coordinates. - Compiled to cuda_tile.store_view_tko. - """ - @noinline function store_partition_view(pv::PartitionView{T, N, Shape}, - tile::Tile{T}, - latency::Union{Int, Nothing}, - allow_tma::Bool, - indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - nothing - end +@intrinsic function store_partition_view(pv::PartitionView{T, N, Shape}, + tile::Tile{T}, + latency::Union{Int, Nothing}, + allow_tma::Bool, + indices::NTuple{M, <:Integer}) where {T, N, Shape, M} + nothing end efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) From 42391fb29c7a8468b9e5afc42398a5e3ff5ce765 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 9 Feb 2026 21:47:12 +0100 Subject: [PATCH 04/17] Detect when intrinsics are executed by the compiler. --- src/compiler/intrinsics.jl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 4cb6d0a..9505bcf 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -4,7 +4,7 @@ module Intrinsics -using Base: compilerbarrier +using Base: compilerbarrier, inferencebarrier using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual @@ -39,10 +39,16 @@ provide a correct scalar implementation using `Core.Intrinsics`, or return `nothing` for side-effect-only intrinsics. """ macro intrinsic(ex) - if ex isa Expr && ex.head in (:function, :(=)) - funcdef = combinedef(splitdef(ex)) + funcdef = if ex isa Expr && ex.head in (:function, :(=)) + combinedef(splitdef(ex)) else - funcdef = Expr(:function, ex, quote compilerbarrier(:type, nothing) end) + body = quote + if inferencebarrier(true)::Bool + error("Intrinsic $(string(ex)) cannot be evaluated at compile time") + end + compilerbarrier(:type, nothing) + end + Expr(:function, ex, body) end funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef) return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) From 7178f62cd9768fce16404829233aa5f16e8f4c4a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Feb 2026 09:36:58 +0100 Subject: [PATCH 05/17] Remove more intrinsics bodies and pass through lattice. --- src/compiler/interface.jl | 11 +- src/compiler/intrinsics.jl | 10 +- src/compiler/intrinsics/arithmetic.jl | 169 ++++++++----------------- src/compiler/intrinsics/atomics.jl | 6 +- src/compiler/intrinsics/conversions.jl | 38 +++++- src/compiler/intrinsics/core.jl | 90 +++++-------- src/compiler/intrinsics/math.jl | 38 +++--- src/compiler/intrinsics/memory.jl | 14 +- src/compiler/intrinsics/misc.jl | 5 +- src/compiler/intrinsics/views.jl | 33 +++-- 10 files changed, 173 insertions(+), 241 deletions(-) diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index 9080e87..f69fc36 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -79,7 +79,7 @@ CC.may_discard_trees(::cuTileInterpreter) = false # Per-intrinsic return type overrides. # Returns nothing when no override applies (fallback). -tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing +tfunc(𝕃, @nospecialize(f), @nospecialize args...) = nothing # Per-intrinsic effect overrides. # Returns nothing when no override applies (fallback). @@ -179,7 +179,8 @@ end arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing}, sv::CC.InferenceState, max_methods::Int) is_intr = isintrinsic(f) - rt_override = tfunc(f, arginfo.argtypes) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv) !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() @@ -211,7 +212,8 @@ elseif isdefined(CC, :Future) # 1.12–1.13 arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) is_intr = isintrinsic(f) - rt_override = tfunc(f, arginfo.argtypes) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv) !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() @@ -244,7 +246,8 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges sv::CC.AbsIntState, max_methods::Int) _infer_subprogram(interp, f, arginfo, si, nothing, sv) # side-effect only is_intr = isintrinsic(f) - rt_override = tfunc(f, arginfo.argtypes) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) rt = rt_override !== nothing ? rt_override : result.rt efunc_override = is_intr ? efunc(f, result.effects) : nothing effects = efunc_override !== nothing ? efunc_override : result.effects diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 9505bcf..df0fc48 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -12,12 +12,10 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end -# NOTE: Intrinsics are never directly folded (concrete_eval_eligible returns :none, -# nonoverlayed=ALWAYS_FALSE taints caller effects). However, overlay callers -# with @assume_effects :foldable override the propagated effects, causing the -# compiler to concrete-evaluate through intrinsic bodies (JuliaLang/julia#60583). -# Intrinsics on such paths need callable bodies (function definition form). -# All others use compilerbarrier(:type, nothing) as a dummy body (bare signature). +# NOTE: Intrinsics use bare signatures with dummy bodies (compilerbarrier(:type, nothing)). +# Return types are provided by tfunc overrides in the interpreter. +# Const-prop for overlay callers happens via @assume_effects :foldable at the +# overlay level, not through intrinsic bodies. using ExprTools: splitdef, combinedef diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 861731b..05a0a2c 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -84,31 +84,24 @@ end ## Integer arithmetic # cuda_tile.absi -@intrinsic absi(x::T) where {T<:Integer} = - ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x) -@intrinsic absi(a::Tile) -function tfunc(::typeof(Intrinsics.absi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +@intrinsic absi(x::Integer) +@intrinsic absi(x::Tile{<:Integer}) +tfunc(𝕃, ::typeof(Intrinsics.absi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args) emit_unop!(ctx, args, encode_AbsIOp!) end # cuda_tile.addi -@intrinsic addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) -@intrinsic addi(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.addi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +@intrinsic addi(x::T, y::T) where {T<:Integer} +@intrinsic addi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.addi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) emit_binop!(ctx, args, encode_AddIOp!) end # cuda_tile.cldi (ceiling division, toward positive infinity) @intrinsic cldi(x, y, s) -tfunc(::typeof(Intrinsics.cldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf) @@ -131,13 +124,13 @@ end end end @intrinsic cmpi(a::Tile, b::Tile, pred, s) -function tfunc(::typeof(Intrinsics.cmpi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s)) + t = CC.widenconst(x) if t <: Tile S = t.parameters[2] return Tile{Bool, S} end - return nothing + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) cb = ctx.cb @@ -166,6 +159,8 @@ end @intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer} s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) end +@intrinsic divi(a::Tile, b::Tile, s) +tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingZero) @@ -173,7 +168,7 @@ end # cuda_tile.fldi (floor division, toward negative infinity) @intrinsic fldi(x, y, s) -tfunc(::typeof(Intrinsics.fldi), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf) @@ -185,25 +180,16 @@ end ifelse(lt, y, x) end @intrinsic maxi(a::Tile, b::Tile, s) -function tfunc(::typeof(Intrinsics.maxi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) emit_binop!(ctx, args, encode_MaxIOp!; signedness) end # cuda_tile.mini -@intrinsic function mini(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, x, y) -end +@intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer} @intrinsic mini(a::Tile, b::Tile, s) -function tfunc(::typeof(Intrinsics.mini), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) emit_binop!(ctx, args, encode_MinIOp!; signedness) @@ -212,42 +198,31 @@ end # cuda_tile.muli @intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) @intrinsic muli(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.muli), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) end # cuda_tile.mulhii -@intrinsic function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} - ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T -end +@intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer} @intrinsic mulhii(a::Tile, b::Tile, s) -function tfunc(::typeof(Intrinsics.mulhii), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) end # cuda_tile.negi -@intrinsic negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x) +@intrinsic negi(x::T) where {T<:Integer} @intrinsic negi(a::Tile) -function tfunc(::typeof(Intrinsics.negi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args) emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone) end # cuda_tile.remi -@intrinsic function remi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y) -end +@intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic remi(a::Tile, b::Tile, s) +tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness")) emit_binop!(ctx, args, encode_RemIOp!; signedness) @@ -255,14 +230,16 @@ end # cuda_tile.shli @intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) +@intrinsic shli(a::Tile, b::Tile) +tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) emit_binop!(ctx, args, encode_ShLIOp!) end # cuda_tile.shri -@intrinsic function shri(x::T, y::Integer, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T) -end +@intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer} +@intrinsic shri(a::Tile, b::Tile, s) +tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness")) emit_binop!(ctx, args, encode_ShRIOp!; signedness) @@ -271,10 +248,7 @@ end # cuda_tile.subi @intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) @intrinsic subi(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.subi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) end @@ -283,51 +257,31 @@ end ## Floating-point arithmetic # cuda_tile.absf -@intrinsic absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x) +@intrinsic absf(x::T) where {T<:AbstractFloat} @intrinsic absf(a::Tile) -function tfunc(::typeof(Intrinsics.absf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args) emit_unop!(ctx, args, encode_AbsFOp!) end # cuda_tile.addf -@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) +@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} @intrinsic addf(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.addf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) end # cuda_tile.cmpf -@intrinsic function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} - if pred === CmpLessThan - Core.Intrinsics.lt_float(x, y) - elseif pred === CmpLessThanOrEqual - Core.Intrinsics.le_float(x, y) - elseif pred === CmpGreaterThan - Core.Intrinsics.lt_float(y, x) - elseif pred === CmpGreaterThanOrEqual - Core.Intrinsics.le_float(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_float(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_float(x, y) - end -end +@intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} @intrinsic cmpf(a::Tile, b::Tile, pred) -function tfunc(::typeof(Intrinsics.cmpf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred)) + t = CC.widenconst(x) if t <: Tile S = t.parameters[2] return Tile{Bool, S} end - return nothing + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) cb = ctx.cb @@ -352,45 +306,33 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) end # cuda_tile.divf -@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) +@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} @intrinsic divf(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.divf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) end # cuda_tile.mulf -@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) +@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} @intrinsic mulf(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.mulf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) end # cuda_tile.negf -@intrinsic negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x) +@intrinsic negf(x::T) where {T<:AbstractFloat} @intrinsic negf(a::Tile) -function tfunc(::typeof(Intrinsics.negf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args) emit_unop!(ctx, args, encode_NegFOp!) end # cuda_tile.subf -@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) +@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} @intrinsic subf(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.subf), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) end @@ -401,10 +343,7 @@ end # cuda_tile.andi @intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) @intrinsic andi(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.andi), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb tt = ctx.tt @@ -421,12 +360,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) end # cuda_tile.ori -@intrinsic ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) +@intrinsic ori(x::T, y::T) where {T<:Integer} @intrinsic ori(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.ori), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb tt = ctx.tt @@ -443,12 +379,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) end # cuda_tile.xori -@intrinsic xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) +@intrinsic xori(x::T, y::T) where {T<:Integer} @intrinsic xori(a::Tile, b::Tile) -function tfunc(::typeof(Intrinsics.xori), argtypes::Vector{Any}) - t = CC.widenconst(argtypes[2]) - t <: Tile ? t : nothing -end +tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 79258fa..9c480bf 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -33,7 +33,7 @@ end # cuda_tile.atomic_cas_tko @intrinsic atomic_cas(array, index, expected, desired, memory_order, memory_scope) -tfunc(::typeof(Intrinsics.atomic_cas), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) +tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) @@ -161,7 +161,7 @@ end # cuda_tile.atomic_rmw_tko with XCHG @intrinsic atomic_xchg(array, index, val, memory_order, memory_scope) -tfunc(::typeof(Intrinsics.atomic_xchg), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) +tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) @@ -171,7 +171,7 @@ end # cuda_tile.atomic_rmw_tko with ADD @intrinsic atomic_add(array, index, val, memory_order, memory_scope) -tfunc(::typeof(Intrinsics.atomic_add), argtypes::Vector{Any}) = eltype(CC.widenconst(argtypes[2])) +tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 6aa879f..638b05d 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -6,6 +6,12 @@ @intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) end +function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + tgt = CC.widenconst(target_type) + T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) cb = ctx.cb tt = ctx.tt @@ -24,8 +30,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) end # cuda_tile.ftof (scalar float to float) -@intrinsic function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} - sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x) +@intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type)) + tgt = CC.widenconst(target_type) + T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) cb = ctx.cb @@ -44,8 +54,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) end # cuda_tile.ftoi (scalar float to integer) -@intrinsic function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} - s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x) +@intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + tgt = CC.widenconst(target_type) + T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) cb = ctx.cb @@ -65,8 +79,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) end # cuda_tile.itof (scalar integer to float) -@intrinsic function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} - s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x) +@intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + tgt = CC.widenconst(target_type) + T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) cb = ctx.cb @@ -86,7 +104,13 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) end # cuda_tile.trunci (scalar integer truncation) -@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x) +@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type)) + tgt = CC.widenconst(target_type) + T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index abe3f34..cfbba8a 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -20,11 +20,10 @@ end # cuda_tile.broadcast @intrinsic broadcast(tile, shape) -function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.broadcast), @nospecialize(tile), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile) tile_type <: Tile || return nothing - shape_arg = argtypes[3] + shape_arg = shape_arg isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -100,11 +99,9 @@ end # cuda_tile.cat @intrinsic cat(tiles, axis) -function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.cat), @nospecialize(tiles), @nospecialize(axis_arg)) + tuple_type = CC.widenconst(tiles) tuple_type <: Tuple{Tile, Tile} || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val t1_type = tuple_type.parameters[1] @@ -167,12 +164,10 @@ end # cuda_tile.constant @intrinsic constant(shape, value, T) -function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - shape_arg = argtypes[2] +function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[4]) + type_arg = CC.widenconst(type_arg_lat) type_arg <: Type || return nothing T = type_arg.parameters[1] return Tile{T, Tuple{shape...}} @@ -207,11 +202,9 @@ end # cuda_tile.extract @intrinsic extract(tile, index, shape) -function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.extract), @nospecialize(tile_lat), @nospecialize(index), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[4] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -261,7 +254,7 @@ end # cuda_tile.get_num_tile_blocks @intrinsic get_num_tile_blocks(axis) -tfunc(::typeof(Intrinsics.get_num_tile_blocks), argtypes::Vector{Any}) = Int32 +tfunc(𝕃, ::typeof(Intrinsics.get_num_tile_blocks), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis")) @@ -274,7 +267,7 @@ end # cuda_tile.get_tile_block_id @intrinsic get_tile_block_id(axis) -tfunc(::typeof(Intrinsics.get_tile_block_id), argtypes::Vector{Any}) = Int32 +tfunc(𝕃, ::typeof(Intrinsics.get_tile_block_id), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis")) @@ -290,12 +283,10 @@ end # cuda_tile.iota @intrinsic iota(shape, T) -function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - shape_arg = argtypes[2] +function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[3]) + type_arg = CC.widenconst(type_arg_lat) type_arg <: Type || return nothing T = type_arg.parameters[1] return Tile{T, Tuple{shape...}} @@ -324,7 +315,7 @@ end # cuda_tile.mmaf, cuda_tile.mmai @intrinsic mma(a, b, acc) -tfunc(::typeof(Intrinsics.mma), argtypes::Vector{Any}) = CC.widenconst(argtypes[4]) +tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args) cb = ctx.cb @@ -343,11 +334,10 @@ end # cuda_tile.offset @intrinsic offset(base, offsets) -function tfunc(::typeof(Intrinsics.offset), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - base_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.offset), @nospecialize(base), @nospecialize(offsets)) + base_type = CC.widenconst(base) base_type <: Ptr || return nothing - offsets_type = CC.widenconst(argtypes[3]) + offsets_type = CC.widenconst(offsets) offsets_type <: Tile || return nothing T = eltype(base_type) S = offsets_type.parameters[2] @@ -397,11 +387,9 @@ end # cuda_tile.permute @intrinsic permute(tile, perm) -function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.permute), @nospecialize(tile_lat), @nospecialize(perm_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - perm_arg = argtypes[3] isa(perm_arg, CC.Const) || return nothing perm = perm_arg.val s = size(tile_type) @@ -447,9 +435,8 @@ end # cuda_tile.transpose @intrinsic transpose(tile) -function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.transpose), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing s = size(tile_type) isempty(s) && return nothing @@ -484,11 +471,9 @@ end # cuda_tile.reduce @intrinsic reduce(tiles, axis, f, identities) -function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.reduce), @nospecialize(tiles), @nospecialize(axis_arg), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val result_params = Any[] @@ -615,11 +600,9 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer = # cuda_tile.reshape @intrinsic reshape(tile, shape) -function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.reshape), @nospecialize(tile_lat), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[3] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -684,9 +667,8 @@ end # cuda_tile.scan @intrinsic scan(tiles, axis, f, identities, reverse=false) -function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.scan), @nospecialize(tiles), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing result_params = Any[] for p in tuple_type.parameters @@ -784,10 +766,8 @@ end # cuda_tile.select @intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) @intrinsic select(cond::Tile, x, y) -function tfunc(::typeof(Intrinsics.select), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - cond_type = CC.widenconst(argtypes[2]) - cond_type <: Tile ? CC.widenconst(argtypes[3]) : nothing +function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) + CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) cb = ctx.cb @@ -810,17 +790,15 @@ end # from_scalar: restores jltype to Tile{T, S}. @intrinsic to_scalar(tile) @intrinsic from_scalar(x, S) -function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - T = CC.widenconst(argtypes[2]) - shape_type = CC.widenconst(argtypes[3]) +function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat)) + T = CC.widenconst(x) + shape_type = CC.widenconst(S_lat) shape_type <: Type || return nothing S = shape_type.parameters[1] return Tile{T, S} end -function tfunc(::typeof(Intrinsics.to_scalar), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing return eltype(tile_type) end diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index 8bd3e93..1a35010 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -4,28 +4,28 @@ # cuda_tile.ceil @intrinsic ceil(x) -tfunc(::typeof(Intrinsics.ceil), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args) emit_unop!(ctx, args, encode_CeilOp!) end # cuda_tile.cos @intrinsic cos(x) -tfunc(::typeof(Intrinsics.cos), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args) emit_unop!(ctx, args, encode_CosOp!) end # cuda_tile.cosh @intrinsic cosh(x) -tfunc(::typeof(Intrinsics.cosh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args) emit_unop!(ctx, args, encode_CosHOp!) end # cuda_tile.exp2 @intrinsic exp2(x, flush_to_zero=false) -tfunc(::typeof(Intrinsics.exp2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) cb = ctx.cb @@ -41,7 +41,7 @@ end # cuda_tile.exp @intrinsic exp(x) -tfunc(::typeof(Intrinsics.exp), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) cb = ctx.cb @@ -55,14 +55,14 @@ end # cuda_tile.floor @intrinsic floor(x) -tfunc(::typeof(Intrinsics.floor), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args) emit_unop!(ctx, args, encode_FloorOp!) end # cuda_tile.fma @intrinsic fma(x, y, z) -tfunc(::typeof(Intrinsics.fma), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -79,7 +79,7 @@ end # cuda_tile.log2 @intrinsic log2(x) -tfunc(::typeof(Intrinsics.log2), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) cb = ctx.cb @@ -93,7 +93,7 @@ end # cuda_tile.log @intrinsic log(x) -tfunc(::typeof(Intrinsics.log), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) cb = ctx.cb @@ -107,35 +107,35 @@ end # cuda_tile.maxf @intrinsic maxf(x, y) -tfunc(::typeof(Intrinsics.maxf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) end # cuda_tile.minf @intrinsic minf(x, y) -tfunc(::typeof(Intrinsics.minf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) end # cuda_tile.pow @intrinsic pow(x, y) -tfunc(::typeof(Intrinsics.pow), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) end # cuda_tile.remf @intrinsic remf(x, y) -tfunc(::typeof(Intrinsics.remf), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) end # cuda_tile.rsqrt @intrinsic rsqrt(x, flush_to_zero=false) -tfunc(::typeof(Intrinsics.rsqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args) cb = ctx.cb @@ -151,21 +151,21 @@ end # cuda_tile.sin @intrinsic sin(x) -tfunc(::typeof(Intrinsics.sin), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args) emit_unop!(ctx, args, encode_SinOp!) end # cuda_tile.sinh @intrinsic sinh(x) -tfunc(::typeof(Intrinsics.sinh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args) emit_unop!(ctx, args, encode_SinHOp!) end # cuda_tile.sqrt @intrinsic sqrt(x) -tfunc(::typeof(Intrinsics.sqrt), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args) cb = ctx.cb @@ -179,14 +179,14 @@ end # cuda_tile.tan @intrinsic tan(x) -tfunc(::typeof(Intrinsics.tan), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args) emit_unop!(ctx, args, encode_TanOp!) end # cuda_tile.tanh @intrinsic tanh(x) -tfunc(::typeof(Intrinsics.tanh), argtypes::Vector{Any}) = CC.widenconst(argtypes[2]) +tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args) emit_unop!(ctx, args, encode_TanHOp!) end diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl index f7bf9e5..d4d4f87 100644 --- a/src/compiler/intrinsics/memory.jl +++ b/src/compiler/intrinsics/memory.jl @@ -4,9 +4,8 @@ # cuda_tile.load_ptr_tko @intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing) -function tfunc(::typeof(Intrinsics.load_ptr_tko), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - ptrs_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.load_ptr_tko), @nospecialize(ptrs), @nospecialize args...) + ptrs_type = CC.widenconst(ptrs) ptrs_type <: Tile || return nothing ptr_type = eltype(ptrs_type) ptr_type <: Ptr || return nothing @@ -74,11 +73,10 @@ end # TODO: cuda_tile.make_token # cuda_tile.store_ptr_tko -@intrinsic function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, - latency::Union{Int, Nothing}, - mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} - nothing -end +@intrinsic store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, + latency::Union{Int, Nothing}, + mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} +tfunc(𝕃, ::typeof(Intrinsics.store_ptr_tko), @nospecialize args...) = Nothing efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args) diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl index 2a0a784..fa1c4ba 100644 --- a/src/compiler/intrinsics/misc.jl +++ b/src/compiler/intrinsics/misc.jl @@ -1,9 +1,8 @@ # miscellaneous intrinsics # cuda_tile.assert -@intrinsic function assert(cond::Bool, message::String) - nothing -end +@intrinsic assert(cond::Bool, message::String) +tfunc(𝕃, ::typeof(Intrinsics.assert), @nospecialize(cond), @nospecialize(message)) = Nothing efunc(::typeof(Intrinsics.assert), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args) diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl index fd1bdde..fff19b1 100644 --- a/src/compiler/intrinsics/views.jl +++ b/src/compiler/intrinsics/views.jl @@ -25,7 +25,7 @@ end # cuda_tile.get_index_space_shape @intrinsic get_index_space_shape(pv, axis) -tfunc(::typeof(Intrinsics.get_index_space_shape), argtypes::Vector{Any}) = Int32 +tfunc(𝕃, ::typeof(Intrinsics.get_index_space_shape), @nospecialize(pv), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args) cb = ctx.cb tt = ctx.tt @@ -61,9 +61,8 @@ end # cuda_tile.load_view_tko @intrinsic load_partition_view(pv, latency, allow_tma, indices) -function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - pv_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.load_partition_view), @nospecialize(pv), @nospecialize args...) + pv_type = CC.widenconst(pv) pv_type <: PartitionView || return nothing pv_type isa DataType || return nothing length(pv_type.parameters) >= 3 || return nothing @@ -151,11 +150,9 @@ end # cuda_tile.make_partition_view @intrinsic make_partition_view(tv, shape, padding_mode, order) -function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tv_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.make_partition_view), @nospecialize(tv), @nospecialize(shape_arg), @nospecialize args...) + tv_type = CC.widenconst(tv) tv_type <: TensorView || return nothing - shape_arg = argtypes[3] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tv_type) @@ -302,8 +299,11 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I end # cuda_tile.make_tensor_view -@intrinsic function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N} - TensorView{T, N}() +@intrinsic make_tensor_view(arr::TileArray{T, N}) where {T, N} +function tfunc(𝕃, ::typeof(Intrinsics.make_tensor_view), @nospecialize(arr)) + t = CC.widenconst(arr) + t <: TileArray || return nothing + TensorView{eltype(t), ndims(t)} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args) array_arg = args[1] @@ -324,13 +324,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args end # cuda_tile.store_view_tko -@intrinsic function store_partition_view(pv::PartitionView{T, N, Shape}, - tile::Tile{T}, - latency::Union{Int, Nothing}, - allow_tma::Bool, - indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - nothing -end +@intrinsic store_partition_view(pv::PartitionView{T, N, Shape}, + tile::Tile{T}, + latency::Union{Int, Nothing}, + allow_tma::Bool, + indices::NTuple{M, <:Integer}) where {T, N, Shape, M} +tfunc(𝕃, ::typeof(Intrinsics.store_partition_view), @nospecialize args...) = Nothing efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args) From 6979d0bb221c435001993a8647cdcc36616cd1d0 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Feb 2026 12:15:22 +0100 Subject: [PATCH 06/17] Restrict intrinsics. --- src/compiler/intrinsics/arithmetic.jl | 48 +++++++++++----------- src/compiler/intrinsics/core.jl | 4 +- src/compiler/intrinsics/math.jl | 57 ++++++++++++++++++--------- 3 files changed, 65 insertions(+), 44 deletions(-) diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 05a0a2c..79c33c9 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -100,7 +100,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) end # cuda_tile.cldi (ceiling division, toward positive infinity) -@intrinsic cldi(x, y, s) +@intrinsic cldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic cldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) @@ -123,7 +124,7 @@ end Core.Intrinsics.ne_int(x, y) end end -@intrinsic cmpi(a::Tile, b::Tile, pred, s) +@intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s)) t = CC.widenconst(x) if t <: Tile @@ -159,7 +160,7 @@ end @intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer} s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) end -@intrinsic divi(a::Tile, b::Tile, s) +@intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness")) @@ -167,7 +168,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) end # cuda_tile.fldi (floor division, toward negative infinity) -@intrinsic fldi(x, y, s) +@intrinsic fldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic fldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness")) @@ -179,7 +181,7 @@ end lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) ifelse(lt, y, x) end -@intrinsic maxi(a::Tile, b::Tile, s) +@intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) @@ -188,7 +190,7 @@ end # cuda_tile.mini @intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer} -@intrinsic mini(a::Tile, b::Tile, s) +@intrinsic mini(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) @@ -197,7 +199,7 @@ end # cuda_tile.muli @intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) -@intrinsic muli(a::Tile, b::Tile) +@intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) @@ -205,7 +207,7 @@ end # cuda_tile.mulhii @intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer} -@intrinsic mulhii(a::Tile, b::Tile, s) +@intrinsic mulhii(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) @@ -213,7 +215,7 @@ end # cuda_tile.negi @intrinsic negi(x::T) where {T<:Integer} -@intrinsic negi(a::Tile) +@intrinsic negi(a::Tile{<:Integer}) tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args) emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone) @@ -221,7 +223,7 @@ end # cuda_tile.remi @intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer} -@intrinsic remi(a::Tile, b::Tile, s) +@intrinsic remi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness")) @@ -230,7 +232,7 @@ end # cuda_tile.shli @intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) -@intrinsic shli(a::Tile, b::Tile) +@intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) emit_binop!(ctx, args, encode_ShLIOp!) @@ -238,7 +240,7 @@ end # cuda_tile.shri @intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer} -@intrinsic shri(a::Tile, b::Tile, s) +@intrinsic shri(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness")) @@ -247,7 +249,7 @@ end # cuda_tile.subi @intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) -@intrinsic subi(a::Tile, b::Tile) +@intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) @@ -258,7 +260,7 @@ end # cuda_tile.absf @intrinsic absf(x::T) where {T<:AbstractFloat} -@intrinsic absf(a::Tile) +@intrinsic absf(a::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args) emit_unop!(ctx, args, encode_AbsFOp!) @@ -266,7 +268,7 @@ end # cuda_tile.addf @intrinsic addf(x::T, y::T) where {T<:AbstractFloat} -@intrinsic addf(a::Tile, b::Tile) +@intrinsic addf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) @@ -274,7 +276,7 @@ end # cuda_tile.cmpf @intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} -@intrinsic cmpf(a::Tile, b::Tile, pred) +@intrinsic cmpf(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate) where {T<:AbstractFloat} function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred)) t = CC.widenconst(x) if t <: Tile @@ -307,7 +309,7 @@ end # cuda_tile.divf @intrinsic divf(x::T, y::T) where {T<:AbstractFloat} -@intrinsic divf(a::Tile, b::Tile) +@intrinsic divf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) @@ -315,7 +317,7 @@ end # cuda_tile.mulf @intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} -@intrinsic mulf(a::Tile, b::Tile) +@intrinsic mulf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) @@ -323,7 +325,7 @@ end # cuda_tile.negf @intrinsic negf(x::T) where {T<:AbstractFloat} -@intrinsic negf(a::Tile) +@intrinsic negf(a::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args) emit_unop!(ctx, args, encode_NegFOp!) @@ -331,7 +333,7 @@ end # cuda_tile.subf @intrinsic subf(x::T, y::T) where {T<:AbstractFloat} -@intrinsic subf(a::Tile, b::Tile) +@intrinsic subf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) @@ -342,7 +344,7 @@ end # cuda_tile.andi @intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) -@intrinsic andi(a::Tile, b::Tile) +@intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb @@ -361,7 +363,7 @@ end # cuda_tile.ori @intrinsic ori(x::T, y::T) where {T<:Integer} -@intrinsic ori(a::Tile, b::Tile) +@intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb @@ -380,7 +382,7 @@ end # cuda_tile.xori @intrinsic xori(x::T, y::T) where {T<:Integer} -@intrinsic xori(a::Tile, b::Tile) +@intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index cfbba8a..53b949c 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -314,7 +314,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) end # cuda_tile.mmaf, cuda_tile.mmai -@intrinsic mma(a, b, acc) +@intrinsic mma(a::Tile, b::Tile, acc::Tile) tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args) cb = ctx.cb @@ -765,7 +765,7 @@ end # cuda_tile.select @intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) -@intrinsic select(cond::Tile, x, y) +@intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T} function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) CC.widenconst(x) end diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index 1a35010..519cf1e 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -3,28 +3,32 @@ ## Floating-point math # cuda_tile.ceil -@intrinsic ceil(x) +@intrinsic ceil(x::AbstractFloat) +@intrinsic ceil(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args) emit_unop!(ctx, args, encode_CeilOp!) end # cuda_tile.cos -@intrinsic cos(x) +@intrinsic cos(x::AbstractFloat) +@intrinsic cos(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args) emit_unop!(ctx, args, encode_CosOp!) end # cuda_tile.cosh -@intrinsic cosh(x) +@intrinsic cosh(x::AbstractFloat) +@intrinsic cosh(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args) emit_unop!(ctx, args, encode_CosHOp!) end # cuda_tile.exp2 -@intrinsic exp2(x, flush_to_zero=false) +@intrinsic exp2(x::AbstractFloat, flush_to_zero::Bool=false) +@intrinsic exp2(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false) tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) cb = ctx.cb @@ -40,7 +44,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) end # cuda_tile.exp -@intrinsic exp(x) +@intrinsic exp(x::AbstractFloat) +@intrinsic exp(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) cb = ctx.cb @@ -54,14 +59,16 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) end # cuda_tile.floor -@intrinsic floor(x) +@intrinsic floor(x::AbstractFloat) +@intrinsic floor(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args) emit_unop!(ctx, args, encode_FloorOp!) end # cuda_tile.fma -@intrinsic fma(x, y, z) +@intrinsic fma(x::T, y::T, z::T) where {T<:AbstractFloat} +@intrinsic fma(x::Tile{T}, y::Tile{T}, z::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -78,7 +85,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) end # cuda_tile.log2 -@intrinsic log2(x) +@intrinsic log2(x::AbstractFloat) +@intrinsic log2(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) cb = ctx.cb @@ -92,7 +100,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) end # cuda_tile.log -@intrinsic log(x) +@intrinsic log(x::AbstractFloat) +@intrinsic log(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) cb = ctx.cb @@ -106,35 +115,40 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) end # cuda_tile.maxf -@intrinsic maxf(x, y) +@intrinsic maxf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic maxf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) end # cuda_tile.minf -@intrinsic minf(x, y) +@intrinsic minf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic minf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) end # cuda_tile.pow -@intrinsic pow(x, y) +@intrinsic pow(x::T, y::T) where {T<:AbstractFloat} +@intrinsic pow(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) end # cuda_tile.remf -@intrinsic remf(x, y) +@intrinsic remf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic remf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) end # cuda_tile.rsqrt -@intrinsic rsqrt(x, flush_to_zero=false) +@intrinsic rsqrt(x::AbstractFloat, flush_to_zero::Bool=false) +@intrinsic rsqrt(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false) tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args) cb = ctx.cb @@ -150,21 +164,24 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args) end # cuda_tile.sin -@intrinsic sin(x) +@intrinsic sin(x::AbstractFloat) +@intrinsic sin(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args) emit_unop!(ctx, args, encode_SinOp!) end # cuda_tile.sinh -@intrinsic sinh(x) +@intrinsic sinh(x::AbstractFloat) +@intrinsic sinh(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args) emit_unop!(ctx, args, encode_SinHOp!) end # cuda_tile.sqrt -@intrinsic sqrt(x) +@intrinsic sqrt(x::AbstractFloat) +@intrinsic sqrt(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args) cb = ctx.cb @@ -178,14 +195,16 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args) end # cuda_tile.tan -@intrinsic tan(x) +@intrinsic tan(x::AbstractFloat) +@intrinsic tan(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args) emit_unop!(ctx, args, encode_TanOp!) end # cuda_tile.tanh -@intrinsic tanh(x) +@intrinsic tanh(x::AbstractFloat) +@intrinsic tanh(x::Tile{<:AbstractFloat}) tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args) emit_unop!(ctx, args, encode_TanHOp!) From ac32212fa4a9fbdf2e1ce713972b0d0bd8ff5463 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Feb 2026 21:40:22 +0100 Subject: [PATCH 07/17] Get ifelse working by avoiding the error. --- src/compiler/intrinsics.jl | 3 --- src/compiler/intrinsics/core.jl | 5 ++++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index df0fc48..37c4b4b 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -41,9 +41,6 @@ macro intrinsic(ex) combinedef(splitdef(ex)) else body = quote - if inferencebarrier(true)::Bool - error("Intrinsic $(string(ex)) cannot be evaluated at compile time") - end compilerbarrier(:type, nothing) end Expr(:function, ex, body) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 53b949c..346ea0d 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -764,9 +764,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) end # cuda_tile.select -@intrinsic select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) +@intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y) @intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T} function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) + if cond isa CC.Const + return cond.val === true ? x : y + end CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) From 08b208fbebe4354a106f03e913db7bba54c20338 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 10 Feb 2026 21:59:59 +0100 Subject: [PATCH 08/17] Remove more intrinsic bodies. --- src/compiler/intrinsics.jl | 10 +++----- src/compiler/intrinsics/arithmetic.jl | 33 ++++++-------------------- src/compiler/intrinsics/conversions.jl | 4 +--- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 37c4b4b..ab6d02c 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -37,14 +37,10 @@ provide a correct scalar implementation using `Core.Intrinsics`, or return `nothing` for side-effect-only intrinsics. """ macro intrinsic(ex) - funcdef = if ex isa Expr && ex.head in (:function, :(=)) - combinedef(splitdef(ex)) - else - body = quote - compilerbarrier(:type, nothing) - end - Expr(:function, ex, body) + body = quote + compilerbarrier(:type, nothing) end + funcdef = Expr(:function, ex, body) funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef) return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) end diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 79c33c9..9af46d0 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -109,21 +109,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) end # cuda_tile.cmpi -@intrinsic function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} - if pred === CmpLessThan - s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - elseif pred === CmpLessThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y) - elseif pred === CmpGreaterThan - s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x) - elseif pred === CmpGreaterThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_int(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_int(x, y) - end -end +@intrinsic cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} @intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s)) t = CC.widenconst(x) @@ -157,9 +143,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) end # cuda_tile.divi (truncating division, toward zero) -@intrinsic function divi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) -end +@intrinsic divi(x::T, y::T, s::Signedness) where {T<:Integer} @intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) @@ -177,10 +161,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) end # cuda_tile.maxi -@intrinsic function maxi(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, y, x) -end +@intrinsic maxi(x::T, y::T, s::Signedness) where {T<:Integer} @intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) @@ -198,7 +179,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) end # cuda_tile.muli -@intrinsic muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) +@intrinsic muli(x::T, y::T) where {T<:Integer} @intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) @@ -231,7 +212,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) end # cuda_tile.shli -@intrinsic shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) +@intrinsic shli(x::T, y::Integer) where {T<:Integer} @intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) @@ -248,7 +229,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) end # cuda_tile.subi -@intrinsic subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) +@intrinsic subi(x::T, y::T) where {T<:Integer} @intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) @@ -343,7 +324,7 @@ end ## Boolean arithmetic # cuda_tile.andi -@intrinsic andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) +@intrinsic andi(x::T, y::T) where {T<:Integer} @intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer} tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 638b05d..409bff1 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -3,9 +3,7 @@ # TODO: cuda_tile.bitcast # cuda_tile.exti (scalar integer extension) -@intrinsic function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} - s === SignednessSigned ? Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) -end +@intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) tgt = CC.widenconst(target_type) T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing From ca66ae76a95cdd8101586588efa7483308b50326 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 09:09:22 +0100 Subject: [PATCH 09/17] Remove ExprTools. --- Project.toml | 12 +++++------- src/compiler/intrinsics.jl | 2 -- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index cdff353..5323854 100644 --- a/Project.toml +++ b/Project.toml @@ -8,10 +8,9 @@ projects = ["test", "examples"] [deps] BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8" CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d" -ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" [weakdeps] @@ -19,18 +18,17 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" [sources] -CompilerCaching = {url = "https://github.com/maleadt/CompilerCaching.jl", rev="main"} -IRStructurizer = {url = "https://github.com/maleadt/IRStructurizer.jl", rev = "main"} +CompilerCaching = {rev = "main", url = "https://github.com/maleadt/CompilerCaching.jl"} +IRStructurizer = {rev = "main", url = "https://github.com/maleadt/IRStructurizer.jl"} [extensions] CUDAExt = "CUDA" DLFP8TypesExt = "DLFP8Types" [compat] -julia = "1.11" BFloat16s = "0.6" -CompilerCaching = "0.1" CUDA_Compiler_jll = "0.4" CUDA_Tile_jll = "13.1" -ExprTools = "0.1" +CompilerCaching = "0.1" IRStructurizer = "0.1" +julia = "1.11" diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index ab6d02c..56cce9b 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -17,8 +17,6 @@ end # Const-prop for overlay callers happens via @assume_effects :foldable at the # overlay level, not through intrinsic bodies. -using ExprTools: splitdef, combinedef - """ @intrinsic signature @intrinsic function_definition From 5a686d898542d59c501dea58c390bd71d1832134 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 09:46:45 +0100 Subject: [PATCH 10/17] Port tfunc improvements from Julia. --- src/compiler/intrinsics.jl | 18 ++++++++++++++++- src/compiler/intrinsics/arithmetic.jl | 18 +++++++++++++++-- src/compiler/intrinsics/conversions.jl | 20 +++++++++---------- src/compiler/intrinsics/core.jl | 27 ++++++++++++++------------ 4 files changed, 58 insertions(+), 25 deletions(-) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 56cce9b..c1d27aa 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -43,7 +43,21 @@ macro intrinsic(ex) return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) end -emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing +""" + instanceof_tfunc(lat) -> Type or nothing + +Extract `T` from a lattice element representing `Type{T}`. +Simplified version of `Base.Compiler.instanceof_tfunc` that handles `Const(T)` +and `Type{T}` lattice elements. Returns `nothing` when `T` cannot be determined. +""" +function instanceof_tfunc(@nospecialize(lat)) + if isa(lat, CC.Const) + val = lat.val + return val isa Type ? val : nothing + end + tgt = CC.widenconst(lat) + return tgt isa DataType && tgt <: Type && !isempty(tgt.parameters) ? tgt.parameters[1] : nothing +end # Shared helper for creating load/store optimization hints function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true) @@ -53,6 +67,8 @@ function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, all return make_load_store_hints(ctx.sm_arch, hints) end +emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing + include("intrinsics/core.jl") include("intrinsics/conversions.jl") include("intrinsics/arithmetic.jl") diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 9af46d0..3aba1c6 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -326,7 +326,14 @@ end # cuda_tile.andi @intrinsic andi(x::T, y::T) where {T<:Integer} @intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer} -tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) +function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === false && CC.widenconst(y) === Bool + return CC.Const(false) + elseif isa(y, CC.Const) && y.val === false && CC.widenconst(x) === Bool + return CC.Const(false) + end + return CC.widenconst(x) +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb tt = ctx.tt @@ -345,7 +352,14 @@ end # cuda_tile.ori @intrinsic ori(x::T, y::T) where {T<:Integer} @intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer} -tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) +function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === true && CC.widenconst(y) === Bool + return CC.Const(true) + elseif isa(y, CC.Const) && y.val === true && CC.widenconst(x) === Bool + return CC.Const(true) + end + return CC.widenconst(x) +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 409bff1..e302063 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -5,8 +5,8 @@ # cuda_tile.exti (scalar integer extension) @intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) - tgt = CC.widenconst(target_type) - T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + T = instanceof_tfunc(target_type) + T === nothing && return nothing src = CC.widenconst(x) src <: Tile ? similar_type(src, T) : T end @@ -30,8 +30,8 @@ end # cuda_tile.ftof (scalar float to float) @intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type)) - tgt = CC.widenconst(target_type) - T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + T = instanceof_tfunc(target_type) + T === nothing && return nothing src = CC.widenconst(x) src <: Tile ? similar_type(src, T) : T end @@ -54,8 +54,8 @@ end # cuda_tile.ftoi (scalar float to integer) @intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) - tgt = CC.widenconst(target_type) - T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + T = instanceof_tfunc(target_type) + T === nothing && return nothing src = CC.widenconst(x) src <: Tile ? similar_type(src, T) : T end @@ -79,8 +79,8 @@ end # cuda_tile.itof (scalar integer to float) @intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) - tgt = CC.widenconst(target_type) - T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + T = instanceof_tfunc(target_type) + T === nothing && return nothing src = CC.widenconst(x) src <: Tile ? similar_type(src, T) : T end @@ -104,8 +104,8 @@ end # cuda_tile.trunci (scalar integer truncation) @intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type)) - tgt = CC.widenconst(target_type) - T = tgt isa DataType && tgt <: Type ? tgt.parameters[1] : return nothing + T = instanceof_tfunc(target_type) + T === nothing && return nothing src = CC.widenconst(x) src <: Tile ? similar_type(src, T) : T end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 346ea0d..306d13a 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -167,9 +167,8 @@ end function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(type_arg_lat) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args) @@ -286,9 +285,8 @@ end function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(type_arg_lat) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) @@ -767,10 +765,16 @@ end @intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y) @intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T} function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) - if cond isa CC.Const - return cond.val === true ? x : y + if isa(cond, CC.Const) + if cond.val === true + return x + elseif cond.val === false + return y + else + return Union{} + end end - CC.widenconst(x) + return CC.tmerge(𝕃, x, y) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) cb = ctx.cb @@ -795,9 +799,8 @@ end @intrinsic from_scalar(x, S) function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat)) T = CC.widenconst(x) - shape_type = CC.widenconst(S_lat) - shape_type <: Type || return nothing - S = shape_type.parameters[1] + S = instanceof_tfunc(S_lat) + S === nothing && return nothing return Tile{T, S} end function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat)) From 335dd1404761d215c59c657a46f7ef019fe59dbb Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 09:59:18 +0100 Subject: [PATCH 11/17] Fix docstring. --- src/compiler/intrinsics.jl | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index c1d27aa..5275036 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -12,27 +12,12 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end -# NOTE: Intrinsics use bare signatures with dummy bodies (compilerbarrier(:type, nothing)). -# Return types are provided by tfunc overrides in the interpreter. -# Const-prop for overlay callers happens via @assume_effects :foldable at the -# overlay level, not through intrinsic bodies. - """ @intrinsic signature - @intrinsic function_definition - -Define a Tile IR intrinsic in the `Intrinsics` module. - -A bare signature (e.g. `@intrinsic foo(x)`) creates a dummy body using -`compilerbarrier(:type, nothing)` so body inference returns `Any`. Actual -return types come from `tfunc` overrides in the interpreter. -A function definition (e.g. `@intrinsic foo(x) = expr`) preserves the body, -providing a callable implementation for concrete evaluation. This is needed -when overlay callers with `@assume_effects :foldable` cause the compiler to -evaluate through intrinsic bodies (JuliaLang/julia#60583). The body should -provide a correct scalar implementation using `Core.Intrinsics`, or return -`nothing` for side-effect-only intrinsics. +Define a Tile IR intrinsic in the `Intrinsics` module. These intrinsics are +defined to return `Any`, so need additional `tfunc` and `efunc` definitions +to specify their behavior. """ macro intrinsic(ex) body = quote From 6a93f0c8b61dff60bc223c0edea4cfe29730305a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 10:17:28 +0100 Subject: [PATCH 12/17] Simplify. --- src/compiler/interface.jl | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index f69fc36..6935a75 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -192,11 +192,6 @@ end rt = rt_override !== nothing ? rt_override : cm.rt efunc_override = is_intr ? efunc(f, cm.effects) : nothing effects = efunc_override !== nothing ? efunc_override : cm.effects - # Mark intrinsics as non-consistently-overlayed so callers can't be - # concrete-eval'd (not_callable() bodies would throw at runtime). - if is_intr - effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) - end info = is_intr ? CC.NoCallInfo() : cm.info info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) @@ -225,11 +220,6 @@ elseif isdefined(CC, :Future) # 1.12–1.13 rt = rt_override !== nothing ? rt_override : cm.rt efunc_override = is_intr ? efunc(f, cm.effects) : nothing effects = efunc_override !== nothing ? efunc_override : cm.effects - # Mark intrinsics as non-consistently-overlayed so callers can't be - # concrete-eval'd (not_callable() bodies would throw at runtime). - if is_intr - effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) - end info = is_intr ? CC.NoCallInfo() : cm.info info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) @@ -251,11 +241,6 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges rt = rt_override !== nothing ? rt_override : result.rt efunc_override = is_intr ? efunc(f, result.effects) : nothing effects = efunc_override !== nothing ? efunc_override : result.effects - # Mark intrinsics as non-consistently-overlayed so callers can't be - # concrete-eval'd (not_callable() bodies would throw at runtime). - if is_intr - effects = CC.Effects(effects; nonoverlayed=CC.ALWAYS_FALSE) - end info = is_intr ? CC.NoCallInfo() : result.info if is_intr || rt_override !== nothing return CC.CallMeta(rt, result.exct, effects, info) From ca6e92342ab9c414d69021864469ab9d123b1ced Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 10:18:30 +0100 Subject: [PATCH 13/17] Use released versions of packages. --- Project.toml | 4 ---- test/Project.toml | 3 --- 2 files changed, 7 deletions(-) diff --git a/Project.toml b/Project.toml index 5323854..6f344ac 100644 --- a/Project.toml +++ b/Project.toml @@ -17,10 +17,6 @@ IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" -[sources] -CompilerCaching = {rev = "main", url = "https://github.com/maleadt/CompilerCaching.jl"} -IRStructurizer = {rev = "main", url = "https://github.com/maleadt/IRStructurizer.jl"} - [extensions] CUDAExt = "CUDA" DLFP8TypesExt = "DLFP8Types" diff --git a/test/Project.toml b/test/Project.toml index 278b9d8..bd30c97 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -8,9 +8,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -[sources] -FileCheck = {url = "https://github.com/JuliaLLVM/FileCheck.jl", rev = "main"} - [compat] FileCheck = "1.0" ParallelTestRunner = "2.0" From 09bb7e3ba8064ed870e4d4c5b04c7489a18b80ea Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 11:14:35 +0100 Subject: [PATCH 14/17] Handle literal SSA statements in emit_statement! When Julia's optimizer constant-folds an SSA statement (via concrete eval, SROA, constant propagation), it becomes a bare literal value instead of an Expr(:call, ...). The else-branch in emit_statement! previously just warned and discarded these, so the SSA slot was never registered in ctx.values, causing "SSAValue not found" crashes for any downstream reference. Delegate to emit_constant!/emit_value! for literal values, mirroring the existing pattern in emit_rhs!. Add regression test using crafted IR (the optimizer propagates constants too aggressively for a source-level repro). Co-Authored-By: Claude Opus 4.6 --- src/compiler/codegen/statements.jl | 9 +++- test/codegen/integration.jl | 68 ++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/compiler/codegen/statements.jl b/src/compiler/codegen/statements.jl index 13b8c60..d073c08 100644 --- a/src/compiler/codegen/statements.jl +++ b/src/compiler/codegen/statements.jl @@ -26,9 +26,14 @@ function emit_statement!(ctx::CGCtx, @nospecialize(stmt), ssa_idx::Int, @nospeci # PiNode is a type narrowing assertion - store the resolved value tv = emit_value!(ctx, stmt) elseif stmt === nothing - # No-op + # Dead code elimination artifact — no value to register else - @warn "Unhandled statement type" typeof(stmt) stmt + # Literal values from constant folding or concrete eval. + # Try emit_constant! first (numbers/ghost types), fall back to emit_value!. + tv = emit_constant!(ctx, stmt, result_type) + if tv === nothing + tv = emit_value!(ctx, stmt) + end end # Store result by original Julia SSA index diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl index a1503dc..91e1c20 100644 --- a/test/codegen/integration.jl +++ b/test/codegen/integration.jl @@ -1119,3 +1119,71 @@ end end end end + +#============================================================================= + Literal SSA Statement Handling +=============================================================================# + +@testset "Literal SSA statements" begin + # Regression test: when Julia's optimizer constant-folds an SSA statement + # (via concrete eval, SROA, constant propagation), the statement becomes a + # bare literal value instead of an Expr(:call, ...). emit_statement! must + # register a CGVal for these so downstream SSAValue references resolve. + # + # Strategy: compile a real kernel, replace one Expr with a literal in the + # IRCode (simulating constant folding), then verify codegen succeeds. + + spec = ct.ArraySpec{1}(16, true) + + function _literal_test_kernel(a::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + ct.store(a, pid, tile) + return + end + + function _find_intrinsic_call(ir, callee) + for (i, inst) in enumerate(ir.stmts) + stmt = inst[:stmt] + if stmt isa Expr && stmt.head === :call && length(stmt.args) >= 1 + if stmt.args[1] === callee + return i + end + end + end + return nothing + end + + function _test_literal_ssa(value) + argtypes = Tuple{ct.TileArray{Float32,1,spec}} + world = Base.get_world_counter() + mi = something( + ct.method_instance(_literal_test_kernel, argtypes; world, + method_table=ct.cuTileMethodTable), + ct.method_instance(_literal_test_kernel, argtypes; world)) + ir, _ = ct.code_ircode(mi) + + # Replace first subi(pid, 1) with a literal — simulates constant folding + idx = _find_intrinsic_call(ir, ct.Intrinsics.subi) + @assert idx !== nothing "test setup: could not find subi call in IR" + ir.stmts[idx][:stmt] = value + + sci = ct.StructuredIRCode(ir) + bytecode = ct.write_bytecode!(1) do writer, func_buf + ct.emit_kernel!(writer, func_buf, sci, Nothing; + name="literal_test", + cache=ct.CacheView{ct.CuTileResults}( + (:cuTile, (sm_arch=nothing, opt_level=3, + num_ctas=nothing, occupancy=nothing)), world)) + end + return length(bytecode) > 0 + end + + @testset "Int32 zero literal" begin + @test _test_literal_ssa(Int32(0)) + end + + @testset "Int32 nonzero literal" begin + @test _test_literal_ssa(Int32(42)) + end +end From 8aa32eefa5f6f64915d67675ef076904ee70e4e1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 11:28:37 +0100 Subject: [PATCH 15/17] Emit false constant for :boundscheck expressions in codegen Previously, emit_expr! returned nothing for Expr(:boundscheck), so the SSA slot was never registered. When the IR structurizer created an IfOp whose condition referenced that SSA, emit_value! crashed with "SSAValue not found". This happens when concrete_eval_eligible doesn't block semi-concrete eval, causing @boundscheck blocks from tuple indexing in the One() adapter to survive to codegen. Co-Authored-By: Claude Opus 4.6 --- src/compiler/codegen/expressions.jl | 4 +- test/codegen/integration.jl | 57 +++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl index 9abe616..61474c2 100644 --- a/src/compiler/codegen/expressions.jl +++ b/src/compiler/codegen/expressions.jl @@ -17,7 +17,9 @@ function emit_expr!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) elseif expr.head === :foreigncall throw(IRError("Foreign calls not supported in Tile IR")) elseif expr.head === :boundscheck - return nothing + # Bounds checking is always disabled in Tile IR kernels. + # Emit false so IfOps referencing this SSA can resolve the condition. + return emit_constant!(ctx, false, Bool) else @warn "Unhandled expression head" expr.head expr return nothing diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl index 91e1c20..546e6e9 100644 --- a/test/codegen/integration.jl +++ b/test/codegen/integration.jl @@ -1124,14 +1124,10 @@ end Literal SSA Statement Handling =============================================================================# -@testset "Literal SSA statements" begin - # Regression test: when Julia's optimizer constant-folds an SSA statement - # (via concrete eval, SROA, constant propagation), the statement becomes a - # bare literal value instead of an Expr(:call, ...). emit_statement! must - # register a CGVal for these so downstream SSAValue references resolve. - # - # Strategy: compile a real kernel, replace one Expr with a literal in the - # IRCode (simulating constant folding), then verify codegen succeeds. +@testset "Statement emission edge cases" begin + # Regression tests: certain IR statement forms (literal values from constant + # folding, :boundscheck expressions from inlined @boundscheck blocks) must + # register CGVals so downstream SSAValue references resolve. spec = ct.ArraySpec{1}(16, true) @@ -1186,4 +1182,49 @@ end @testset "Int32 nonzero literal" begin @test _test_literal_ssa(Int32(42)) end + + @testset "Expr(:boundscheck) registers CGVal" begin + # Regression test: Expr(:boundscheck) must emit a Bool constant so that + # downstream SSAValue references (e.g., IfOp conditions) can resolve. + # Previously, emit_expr! returned nothing for :boundscheck, leaving the + # SSA slot unregistered and causing "SSAValue not found" crashes. + # + # Strategy: inject Expr(:boundscheck) at the subi position and replace + # the downstream reference with a constant so codegen completes cleanly. + argtypes = Tuple{ct.TileArray{Float32,1,spec}} + world = Base.get_world_counter() + mi = something( + ct.method_instance(_literal_test_kernel, argtypes; world, + method_table=ct.cuTileMethodTable), + ct.method_instance(_literal_test_kernel, argtypes; world)) + ir, _ = ct.code_ircode(mi) + + # Replace first subi with Expr(:boundscheck) — simulates inlined @boundscheck + idx = _find_intrinsic_call(ir, ct.Intrinsics.subi) + @assert idx !== nothing "test setup: could not find subi call in IR" + ir.stmts[idx][:stmt] = Expr(:boundscheck) + ir.stmts[idx][:type] = Bool + # Fix downstream: replace the SSAValue reference to subi with a constant + # so the load_view doesn't fail on a Bool argument + for i in (idx+1):length(ir.stmts) + stmt = ir.stmts[i][:stmt] + if stmt isa Expr + for (j, arg) in enumerate(stmt.args) + if arg === Core.SSAValue(idx) + stmt.args[j] = Int32(0) + end + end + end + end + + sci = ct.StructuredIRCode(ir) + bytecode = ct.write_bytecode!(1) do writer, func_buf + ct.emit_kernel!(writer, func_buf, sci, Nothing; + name="boundscheck_test", + cache=ct.CacheView{ct.CuTileResults}( + (:cuTile, (sm_arch=nothing, opt_level=3, + num_ctas=nothing, occupancy=nothing)), world)) + end + @test length(bytecode) > 0 + end end From 60fa5bbfdb3ff6482b39025d5337b0e502b842d6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 11:33:52 +0100 Subject: [PATCH 16/17] Re-enable concrete eval for intrinsics. --- src/compiler/interface.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index 6935a75..86f8f8b 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -250,7 +250,6 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges end # Disable semi-concrete interpretation (broken with overlays per JuliaLang/julia#47349) -# and block concrete eval for intrinsics (not_callable() bodies return dummy values). function CC.concrete_eval_eligible(interp::cuTileInterpreter, @nospecialize(f), result::CC.MethodCallResult, arginfo::CC.ArgInfo, sv::CC.InferenceState) ret = @invoke CC.concrete_eval_eligible(interp::CC.AbstractInterpreter, @@ -258,9 +257,6 @@ function CC.concrete_eval_eligible(interp::cuTileInterpreter, if ret === :semi_concrete_eval return :none end - if ret === :concrete_eval && isintrinsic(f) - return :none - end return ret end From ccd6572771b3c8107a0b510212f7837f95093921 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Feb 2026 11:36:43 +0100 Subject: [PATCH 17/17] Remove fragile tests. --- test/codegen/integration.jl | 109 ------------------------------------ 1 file changed, 109 deletions(-) diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl index 546e6e9..a1503dc 100644 --- a/test/codegen/integration.jl +++ b/test/codegen/integration.jl @@ -1119,112 +1119,3 @@ end end end end - -#============================================================================= - Literal SSA Statement Handling -=============================================================================# - -@testset "Statement emission edge cases" begin - # Regression tests: certain IR statement forms (literal values from constant - # folding, :boundscheck expressions from inlined @boundscheck blocks) must - # register CGVals so downstream SSAValue references resolve. - - spec = ct.ArraySpec{1}(16, true) - - function _literal_test_kernel(a::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - ct.store(a, pid, tile) - return - end - - function _find_intrinsic_call(ir, callee) - for (i, inst) in enumerate(ir.stmts) - stmt = inst[:stmt] - if stmt isa Expr && stmt.head === :call && length(stmt.args) >= 1 - if stmt.args[1] === callee - return i - end - end - end - return nothing - end - - function _test_literal_ssa(value) - argtypes = Tuple{ct.TileArray{Float32,1,spec}} - world = Base.get_world_counter() - mi = something( - ct.method_instance(_literal_test_kernel, argtypes; world, - method_table=ct.cuTileMethodTable), - ct.method_instance(_literal_test_kernel, argtypes; world)) - ir, _ = ct.code_ircode(mi) - - # Replace first subi(pid, 1) with a literal — simulates constant folding - idx = _find_intrinsic_call(ir, ct.Intrinsics.subi) - @assert idx !== nothing "test setup: could not find subi call in IR" - ir.stmts[idx][:stmt] = value - - sci = ct.StructuredIRCode(ir) - bytecode = ct.write_bytecode!(1) do writer, func_buf - ct.emit_kernel!(writer, func_buf, sci, Nothing; - name="literal_test", - cache=ct.CacheView{ct.CuTileResults}( - (:cuTile, (sm_arch=nothing, opt_level=3, - num_ctas=nothing, occupancy=nothing)), world)) - end - return length(bytecode) > 0 - end - - @testset "Int32 zero literal" begin - @test _test_literal_ssa(Int32(0)) - end - - @testset "Int32 nonzero literal" begin - @test _test_literal_ssa(Int32(42)) - end - - @testset "Expr(:boundscheck) registers CGVal" begin - # Regression test: Expr(:boundscheck) must emit a Bool constant so that - # downstream SSAValue references (e.g., IfOp conditions) can resolve. - # Previously, emit_expr! returned nothing for :boundscheck, leaving the - # SSA slot unregistered and causing "SSAValue not found" crashes. - # - # Strategy: inject Expr(:boundscheck) at the subi position and replace - # the downstream reference with a constant so codegen completes cleanly. - argtypes = Tuple{ct.TileArray{Float32,1,spec}} - world = Base.get_world_counter() - mi = something( - ct.method_instance(_literal_test_kernel, argtypes; world, - method_table=ct.cuTileMethodTable), - ct.method_instance(_literal_test_kernel, argtypes; world)) - ir, _ = ct.code_ircode(mi) - - # Replace first subi with Expr(:boundscheck) — simulates inlined @boundscheck - idx = _find_intrinsic_call(ir, ct.Intrinsics.subi) - @assert idx !== nothing "test setup: could not find subi call in IR" - ir.stmts[idx][:stmt] = Expr(:boundscheck) - ir.stmts[idx][:type] = Bool - # Fix downstream: replace the SSAValue reference to subi with a constant - # so the load_view doesn't fail on a Bool argument - for i in (idx+1):length(ir.stmts) - stmt = ir.stmts[i][:stmt] - if stmt isa Expr - for (j, arg) in enumerate(stmt.args) - if arg === Core.SSAValue(idx) - stmt.args[j] = Int32(0) - end - end - end - end - - sci = ct.StructuredIRCode(ir) - bytecode = ct.write_bytecode!(1) do writer, func_buf - ct.emit_kernel!(writer, func_buf, sci, Nothing; - name="boundscheck_test", - cache=ct.CacheView{ct.CuTileResults}( - (:cuTile, (sm_arch=nothing, opt_level=3, - num_ctas=nothing, occupancy=nothing)), world)) - end - @test length(bytecode) > 0 - end -end