diff --git a/Project.toml b/Project.toml index dd1c4ea..6f344ac 100644 --- a/Project.toml +++ b/Project.toml @@ -8,27 +8,23 @@ projects = ["test", "examples"] [deps] BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" -CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" CUDA_Compiler_jll = "d1e2174e-dfdc-576e-b43e-73b79eb1aca8" CUDA_Tile_jll = "2068806d-a867-5dbd-af0e-42c2eb5d895d" +CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" IRStructurizer = "93e32bba-5bb8-402b-805d-ffb066edee93" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" -[sources] -CompilerCaching = {url = "https://github.com/maleadt/CompilerCaching.jl", rev="main"} -IRStructurizer = {url = "https://github.com/maleadt/IRStructurizer.jl", rev = "main"} - [extensions] CUDAExt = "CUDA" DLFP8TypesExt = "DLFP8Types" [compat] -julia = "1.11" BFloat16s = "0.6" -CompilerCaching = "0.1" CUDA_Compiler_jll = "0.4" CUDA_Tile_jll = "13.1" +CompilerCaching = "0.1" IRStructurizer = "0.1" +julia = "1.11" diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl index 02b7c38..61474c2 100644 --- a/src/compiler/codegen/expressions.jl +++ b/src/compiler/codegen/expressions.jl @@ -17,7 +17,9 @@ function emit_expr!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) elseif expr.head === :foreigncall throw(IRError("Foreign calls not supported in Tile IR")) elseif expr.head === :boundscheck - return nothing + # Bounds checking is always disabled in Tile IR kernels. + # Emit false so IfOps referencing this SSA can resolve the condition. + return emit_constant!(ctx, false, Bool) else @warn "Unhandled expression head" expr.head expr return nothing @@ -79,9 +81,7 @@ function emit_call!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) func = get_constant(ctx, args[1]) call_args = args[2:end] - # TODO: This is normally dynamic dispatch, which we should allow. - # However, we currently trigger this when emitting Julia intrinsics. - # We should switch to our own intrinsics entirely, which are only invoked. + # We enter here for dynamic dispatch, but also for all intrinsic functions. @static if isdefined(Core, :throw_methoderror) if func === Core.throw_methoderror diff --git a/src/compiler/codegen/statements.jl b/src/compiler/codegen/statements.jl index 13b8c60..d073c08 100644 --- a/src/compiler/codegen/statements.jl +++ b/src/compiler/codegen/statements.jl @@ -26,9 +26,14 @@ function emit_statement!(ctx::CGCtx, @nospecialize(stmt), ssa_idx::Int, @nospeci # PiNode is a type narrowing assertion - store the resolved value tv = emit_value!(ctx, stmt) elseif stmt === nothing - # No-op + # Dead code elimination artifact β€” no value to register else - @warn "Unhandled statement type" typeof(stmt) stmt + # Literal values from constant folding or concrete eval. + # Try emit_constant! first (numbers/ghost types), fall back to emit_value!. 
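+        # (typically seen when inference replaces a call with its constant result,
+        # leaving the literal value itself as the statement)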
+ tv = emit_constant!(ctx, stmt, result_type) + if tv === nothing + tv = emit_value!(ctx, stmt) + end end # Store result by original Julia SSA index diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index bdc6e8a..86f8f8b 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -74,14 +74,20 @@ CC.may_compress(::cuTileInterpreter) = true CC.may_discard_trees(::cuTileInterpreter) = false #============================================================================= - Custom return-type inference (tfuncs) for intrinsics + Custom inference for intrinsics =============================================================================# -# Per-intrinsic return type overrides using multiple dispatch. +# Per-intrinsic return type overrides. # Returns nothing when no override applies (fallback). -# Concrete per-intrinsic methods are defined in intrinsics/ (after the -# Intrinsics module exists). -tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing +tfunc(𝕃, @nospecialize(f), @nospecialize args...) = nothing + +# Per-intrinsic effect overrides. +# Returns nothing when no override applies (fallback). +efunc(@nospecialize(f), effects::CC.Effects) = nothing + +# Predicate for functions defined in the Intrinsics module. +# These get NoCallInfo() so they stay as Expr(:call) rather than Expr(:invoke). +isintrinsic(@nospecialize(f)) = isa(f, Function) && parentmodule(f) === Intrinsics #============================================================================= Subprogram inference for reduce/scan @@ -172,9 +178,11 @@ end result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing}, sv::CC.InferenceState, max_methods::Int) - rt_override = tfunc(f, arginfo.argtypes) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interpβ€², svβ€²) isready(result) || return false @@ -182,8 +190,11 @@ end cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -195,9 +206,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) - rt_override = tfunc(f, arginfo.argtypes) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) 
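+    # arginfo.argtypes[1] is the lattice element for `f` itself, so only the call
+    # arguments are forwarded; passing `𝕃` lets overrides return refined lattice
+    # elements, e.g. tfunc(𝕃, ::typeof(Intrinsics.andi), x, y) may return Const(false).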
subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv) - rt_override === nothing && subprog === nothing && return result + !is_intr && rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interpβ€², svβ€²) isready(result) || return false @@ -205,8 +218,11 @@ elseif isdefined(CC, :Future) # 1.12–1.13 cm = result[] sp = subprog !== nothing ? subprog[] : nothing rt = rt_override !== nothing ? rt_override : cm.rt - info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info - wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) + efunc_override = is_intr ? efunc(f, cm.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : cm.effects + info = is_intr ? CC.NoCallInfo() : cm.info + info = sp !== nothing ? SubprogramCallInfo(info, sp.info) : info + wrapped[] = CC.CallMeta(rt, cm.exct, effects, info, cm.refinements) return true end) return wrapped @@ -219,10 +235,15 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) _infer_subprogram(interp, f, arginfo, si, nothing, sv) # side-effect only - rt_override = tfunc(f, arginfo.argtypes) - if rt_override !== nothing - return CC.CallMeta(rt_override, result.exct, result.effects, - result.info) + is_intr = isintrinsic(f) + 𝕃 = CC.typeinf_lattice(interp) + rt_override = tfunc(𝕃, f, arginfo.argtypes[2:end]...) + rt = rt_override !== nothing ? rt_override : result.rt + efunc_override = is_intr ? efunc(f, result.effects) : nothing + effects = efunc_override !== nothing ? efunc_override : result.effects + info = is_intr ? CC.NoCallInfo() : result.info + if is_intr || rt_override !== nothing + return CC.CallMeta(rt, result.exct, effects, info) end return result end diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index aa0d425..5275036 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -4,7 +4,7 @@ module Intrinsics -using Base: compilerbarrier, donotdelete +using Base: compilerbarrier, inferencebarrier using ..cuTile: Tile, TileArray, Constant, TensorView, PartitionView using ..cuTile: Signedness, SignednessSigned, SignednessUnsigned using ..cuTile: ComparisonPredicate, CmpLessThan, CmpLessThanOrEqual, CmpGreaterThan, CmpGreaterThanOrEqual, CmpEqual, CmpNotEqual @@ -12,24 +12,37 @@ using ..cuTile: IdentityVal, FloatIdentityVal, IntegerIdentityVal end -# NOTE: Due to JuliaLang/julia#60583, intrinsics may be called during constant evaluation. -# Because of that, such intrinsics (such as basic arithmetic) need to provide an -# implementation that actually computes a valid result using Julia intrinsics. -# -# Sometimes that's not possible, e.g., because the functionality required for that is -# overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those -# intrinsics we disable constant folding using a `compilerbarrier(:const)` -# -# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their -# bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin -# with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body. -# `@assume_effects !:effect_free` does NOT work β€” `override_effects` can only strengthen -# effects (set ALWAYS_TRUE), not weaken them. 
Spoofing `ipo_effects` via a custom -# `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting -# `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct -# approach. +""" + @intrinsic signature -emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing +Define a Tile IR intrinsic in the `Intrinsics` module. These intrinsics are +defined to return `Any`, so need additional `tfunc` and `efunc` definitions +to specify their behavior. +""" +macro intrinsic(ex) + body = quote + compilerbarrier(:type, nothing) + end + funcdef = Expr(:function, ex, body) + funcdef = Expr(:macrocall, Symbol("@noinline"), nothing, funcdef) + return esc(:(Core.eval(Intrinsics, $(QuoteNode(funcdef))))) +end + +""" + instanceof_tfunc(lat) -> Type or nothing + +Extract `T` from a lattice element representing `Type{T}`. +Simplified version of `Base.Compiler.instanceof_tfunc` that handles `Const(T)` +and `Type{T}` lattice elements. Returns `nothing` when `T` cannot be determined. +""" +function instanceof_tfunc(@nospecialize(lat)) + if isa(lat, CC.Const) + val = lat.val + return val isa Type ? val : nothing + end + tgt = CC.widenconst(lat) + return tgt isa DataType && tgt <: Type && !isempty(tgt.parameters) ? tgt.parameters[1] : nothing +end # Shared helper for creating load/store optimization hints function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, allow_tma::Bool=true) @@ -39,6 +52,8 @@ function create_optimization_hints(ctx::CGCtx, latency::Union{Int, Nothing}, all return make_load_store_hints(ctx.sm_arch, hints) end +emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing + include("intrinsics/core.jl") include("intrinsics/conversions.jl") include("intrinsics/arithmetic.jl") diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 6272251..3aba1c6 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -84,53 +84,40 @@ end ## Integer arithmetic # cuda_tile.absi -@eval Intrinsics begin - """Integer absolute value. Compiled to cuda_tile.absi.""" - @noinline absi(x::T) where {T<:Integer} = - ifelse(Core.Intrinsics.slt_int(x, zero(T)), Core.Intrinsics.neg_int(x), x) - @noinline absi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) -end +@intrinsic absi(x::Integer) +@intrinsic absi(x::Tile{<:Integer}) +tfunc(𝕃, ::typeof(Intrinsics.absi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absi), args) emit_unop!(ctx, args, encode_AbsIOp!) end # cuda_tile.addi -@eval Intrinsics begin - @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) - @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic addi(x::T, y::T) where {T<:Integer} +@intrinsic addi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.addi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) emit_binop!(ctx, args, encode_AddIOp!) 
end # cuda_tile.cldi (ceiling division, toward positive infinity) -@eval Intrinsics begin - @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic cldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic cldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.cldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingPositiveInf) end # cuda_tile.cmpi -@eval Intrinsics begin - @noinline function cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} - if pred === CmpLessThan - s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - elseif pred === CmpLessThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(x, y) : Core.Intrinsics.ule_int(x, y) - elseif pred === CmpGreaterThan - s === SignednessSigned ? Core.Intrinsics.slt_int(y, x) : Core.Intrinsics.ult_int(y, x) - elseif pred === CmpGreaterThanOrEqual - s === SignednessSigned ? Core.Intrinsics.sle_int(y, x) : Core.Intrinsics.ule_int(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_int(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_int(x, y) - end +@intrinsic cmpi(x::T, y::T, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} +@intrinsic cmpi(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate, s::Signedness) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.cmpi), @nospecialize(x), @nospecialize(y), @nospecialize(pred), @nospecialize(s)) + t = CC.widenconst(x) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} end - @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} = - Tile{Bool, S}() + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) cb = ctx.cb @@ -156,118 +143,95 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) end # cuda_tile.divi (truncating division, toward zero) -@eval Intrinsics begin - @noinline function divi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? 
Core.Intrinsics.sdiv_int(x, y) : Core.Intrinsics.udiv_int(x, y) - end -end +@intrinsic divi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic divi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.divi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("divi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingZero) end # cuda_tile.fldi (floor division, toward negative infinity) -@eval Intrinsics begin - @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) -end +@intrinsic fldi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic fldi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.fldi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness")) emit_binop!(ctx, args, encode_DivIOp!; signedness, rounding=RoundingNegativeInf) end # cuda_tile.maxi -@eval Intrinsics begin - @noinline function maxi(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, y, x) - end - @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() -end +@intrinsic maxi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic maxi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.maxi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) emit_binop!(ctx, args, encode_MaxIOp!; signedness) end # cuda_tile.mini -@eval Intrinsics begin - @noinline function mini(x::T, y::T, s::Signedness) where {T<:Integer} - lt = s === SignednessSigned ? Core.Intrinsics.slt_int(x, y) : Core.Intrinsics.ult_int(x, y) - ifelse(lt, x, y) - end - @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - Tile{T, S}() -end +@intrinsic mini(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic mini(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.mini), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) emit_binop!(ctx, args, encode_MinIOp!; signedness) end # cuda_tile.muli -@eval Intrinsics begin - @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) - @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic muli(x::T, y::T) where {T<:Integer} +@intrinsic muli(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.muli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) end # cuda_tile.mulhii -@eval Intrinsics begin - """High bits of integer multiply (for extended precision arithmetic). 
Compiled to cuda_tile.mulhii.""" - @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} - ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T - end - @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic mulhii(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic mulhii(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.mulhii), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) end # cuda_tile.negi -@eval Intrinsics begin - @noinline negi(x::T) where {T<:Integer} = Core.Intrinsics.neg_int(x) - @noinline negi(a::Tile{T, S}) where {T<:Integer, S} = compilerbarrier(:const, a) -end +@intrinsic negi(x::T) where {T<:Integer} +@intrinsic negi(a::Tile{<:Integer}) +tfunc(𝕃, ::typeof(Intrinsics.negi), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negi), args) emit_unop!(ctx, args, encode_NegIOp!; overflow=OverflowNone) end # cuda_tile.remi -@eval Intrinsics begin - @noinline function remi(x::T, y::T, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.srem_int(x, y) : Core.Intrinsics.urem_int(x, y) - end -end +@intrinsic remi(x::T, y::T, s::Signedness) where {T<:Integer} +@intrinsic remi(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.remi), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("remi requires compile-time signedness")) emit_binop!(ctx, args, encode_RemIOp!; signedness) end # cuda_tile.shli -@eval Intrinsics begin - @noinline shli(x::T, y::Integer) where {T<:Integer} = Core.Intrinsics.shl_int(x, y % T) -end +@intrinsic shli(x::T, y::Integer) where {T<:Integer} +@intrinsic shli(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.shli), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shli), args) emit_binop!(ctx, args, encode_ShLIOp!) end # cuda_tile.shri -@eval Intrinsics begin - @noinline function shri(x::T, y::Integer, s::Signedness) where {T<:Integer} - s === SignednessSigned ? Core.Intrinsics.ashr_int(x, y % T) : Core.Intrinsics.lshr_int(x, y % T) - end -end +@intrinsic shri(x::T, y::Integer, s::Signedness) where {T<:Integer} +@intrinsic shri(a::Tile{T}, b::Tile{T}, s::Signedness) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.shri), @nospecialize(x), @nospecialize(y), @nospecialize(s)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.shri), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("shri requires compile-time signedness")) emit_binop!(ctx, args, encode_ShRIOp!; signedness) end # cuda_tile.subi -@eval Intrinsics begin - @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) - @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() -end +@intrinsic subi(x::T, y::T) where {T<:Integer} +@intrinsic subi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.subi), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) 
end @@ -276,42 +240,31 @@ end ## Floating-point arithmetic # cuda_tile.absf -@eval Intrinsics begin - @noinline absf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.abs_float(x) - @noinline absf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) -end +@intrinsic absf(x::T) where {T<:AbstractFloat} +@intrinsic absf(a::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.absf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.absf), args) emit_unop!(ctx, args, encode_AbsFOp!) end # cuda_tile.addf -@eval Intrinsics begin - @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) - @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic addf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic addf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.addf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) end # cuda_tile.cmpf -@eval Intrinsics begin - @noinline function cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} - if pred === CmpLessThan - Core.Intrinsics.lt_float(x, y) - elseif pred === CmpLessThanOrEqual - Core.Intrinsics.le_float(x, y) - elseif pred === CmpGreaterThan - Core.Intrinsics.lt_float(y, x) - elseif pred === CmpGreaterThanOrEqual - Core.Intrinsics.le_float(y, x) - elseif pred === CmpEqual - Core.Intrinsics.eq_float(x, y) - else # CmpNotEqual - Core.Intrinsics.ne_float(x, y) - end +@intrinsic cmpf(x::T, y::T, pred::ComparisonPredicate) where {T<:AbstractFloat} +@intrinsic cmpf(a::Tile{T}, b::Tile{T}, pred::ComparisonPredicate) where {T<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.cmpf), @nospecialize(x), @nospecialize(y), @nospecialize(pred)) + t = CC.widenconst(x) + if t <: Tile + S = t.parameters[2] + return Tile{Bool, S} end - @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} = - Tile{Bool, S}() + return Bool end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) cb = ctx.cb @@ -336,37 +289,33 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) end # cuda_tile.divf -@eval Intrinsics begin - @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) - @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic divf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic divf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.divf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) end # cuda_tile.mulf -@eval Intrinsics begin - @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) - @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic mulf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic mulf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.mulf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) 
end # cuda_tile.negf -@eval Intrinsics begin - @noinline negf(x::T) where {T<:AbstractFloat} = Core.Intrinsics.neg_float(x) - @noinline negf(a::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, a) -end +@intrinsic negf(x::T) where {T<:AbstractFloat} +@intrinsic negf(a::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.negf), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.negf), args) emit_unop!(ctx, args, encode_NegFOp!) end # cuda_tile.subf -@eval Intrinsics begin - @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) - @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic subf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic subf(a::Tile{T}, b::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.subf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) end @@ -375,10 +324,15 @@ end ## Boolean arithmetic # cuda_tile.andi -@eval Intrinsics begin - @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) - """Element-wise logical AND for boolean tiles.""" - @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic andi(x::T, y::T) where {T<:Integer} +@intrinsic andi(a::Tile{T}, b::Tile{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.andi), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === false && CC.widenconst(y) === Bool + return CC.Const(false) + elseif isa(y, CC.Const) && y.val === false && CC.widenconst(x) === Bool + return CC.Const(false) + end + return CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb @@ -396,10 +350,15 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) end # cuda_tile.ori -@eval Intrinsics begin - @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) - """Element-wise logical OR for boolean tiles.""" - @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() +@intrinsic ori(x::T, y::T) where {T<:Integer} +@intrinsic ori(a::Tile{T}, b::Tile{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.ori), @nospecialize(x), @nospecialize(y)) + if isa(x, CC.Const) && x.val === true && CC.widenconst(y) === Bool + return CC.Const(true) + elseif isa(y, CC.Const) && y.val === true && CC.widenconst(x) === Bool + return CC.Const(true) + end + return CC.widenconst(x) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb @@ -417,11 +376,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) end # cuda_tile.xori -@eval Intrinsics begin - @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) - """Element-wise logical XOR for boolean tiles.""" - @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() -end +@intrinsic xori(x::T, y::T) where {T<:Integer} +@intrinsic xori(a::Tile{T}, b::Tile{T}) where {T<:Integer} +tfunc(𝕃, ::typeof(Intrinsics.xori), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb tt = ctx.tt diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 3c89bd4..9c480bf 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -31,20 +31,11 @@ 
function memory_scope_to_scope(scope::Int) end # cuda_tile.atomic_cas_tko -@eval Intrinsics begin - """ - atomic_cas(array, index, expected, desired, memory_order, memory_scope) - - Atomic compare-and-swap at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_cas_tko. - """ - @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T))::T - end -end +@intrinsic atomic_cas(array, index, expected, desired, + memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) cb = ctx.cb tt = ctx.tt @@ -169,39 +160,20 @@ function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) end # cuda_tile.atomic_rmw_tko with XCHG -@eval Intrinsics begin - """ - atomic_xchg(array, index, val, memory_order, memory_scope) - - Atomic exchange at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with XCHG. - """ - @noinline function atomic_xchg(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_xchg(array, index, val, memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) emit_atomic_rmw!(ctx, args, AtomicXCHG) end # cuda_tile.atomic_rmw_tko with ADD -@eval Intrinsics begin - """ - atomic_add(array, index, val, memory_order, memory_scope) - - Atomic addition at 0-indexed position. - Returns the original value. - Compiled to cuda_tile.atomic_rmw_tko with ADD. - """ - @noinline function atomic_add(array::TileArray{T, N}, index, val, - memory_order::Int, memory_scope::Int) where {T, N} - donotdelete() - compilerbarrier(:const, zero(T)) - end -end +@intrinsic atomic_add(array, index, val, + memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = + CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 6c33afc..e302063 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -3,10 +3,12 @@ # TODO: cuda_tile.bitcast # cuda_tile.exti (scalar integer extension) -@eval Intrinsics begin - @noinline function exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} - s === SignednessSigned ? 
Core.Intrinsics.sext_int(T, x) : Core.Intrinsics.zext_int(T, x) - end +@intrinsic exti(x::I, ::Type{T}, s::Signedness) where {I<:Integer, T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.exti), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) cb = ctx.cb @@ -26,10 +28,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exti), args) end # cuda_tile.ftof (scalar float to float) -@eval Intrinsics begin - @noinline function ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} - sizeof(F2) > sizeof(F1) ? Core.Intrinsics.fpext(F2, x) : Core.Intrinsics.fptrunc(F2, x) - end +@intrinsic ftof(x::F1, ::Type{F2}) where {F1<:AbstractFloat, F2<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.ftof), @nospecialize(x), @nospecialize(target_type)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) cb = ctx.cb @@ -48,10 +52,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftof), args) end # cuda_tile.ftoi (scalar float to integer) -@eval Intrinsics begin - @noinline function ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} - s === SignednessSigned ? Core.Intrinsics.fptosi(I, x) : Core.Intrinsics.fptoui(I, x) - end +@intrinsic ftoi(x::AbstractFloat, ::Type{I}, s::Signedness) where {I<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.ftoi), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) cb = ctx.cb @@ -71,10 +77,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ftoi), args) end # cuda_tile.itof (scalar integer to float) -@eval Intrinsics begin - @noinline function itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} - s === SignednessSigned ? Core.Intrinsics.sitofp(F, x) : Core.Intrinsics.uitofp(F, x) - end +@intrinsic itof(x::Integer, ::Type{F}, s::Signedness) where {F<:AbstractFloat} +function tfunc(𝕃, ::typeof(Intrinsics.itof), @nospecialize(x), @nospecialize(target_type), @nospecialize(s)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) cb = ctx.cb @@ -94,8 +102,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.itof), args) end # cuda_tile.trunci (scalar integer truncation) -@eval Intrinsics begin - @noinline trunci(x::Integer, ::Type{T}) where {T<:Integer} = Core.Intrinsics.trunc_int(T, x) +@intrinsic trunci(x::Integer, ::Type{T}) where {T<:Integer} +function tfunc(𝕃, ::typeof(Intrinsics.trunci), @nospecialize(x), @nospecialize(target_type)) + T = instanceof_tfunc(target_type) + T === nothing && return nothing + src = CC.widenconst(x) + src <: Tile ? 
similar_type(src, T) : T end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.trunci), args) cb = ctx.cb diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index b64fbcf..306d13a 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -19,22 +19,11 @@ function validate_tile_shape(shape, context::String) end # cuda_tile.broadcast -@eval Intrinsics begin - """ - broadcast(tile, shape_val) - - Explicitly broadcast a tile to a target shape. - Compiled to cuda_tile.broadcast. - """ - @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic broadcast(tile, shape) +function tfunc(𝕃, ::typeof(Intrinsics.broadcast), @nospecialize(tile), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile) tile_type <: Tile || return nothing - shape_arg = argtypes[3] + shape_arg = shape_arg isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -109,22 +98,10 @@ function broadcast_tile_to_shape!(cb::CodeBuilder, tt::TypeTable, tv::CGVal, end # cuda_tile.cat -@eval Intrinsics begin - """ - cat(tiles, axis_val) - - Concatenate two tiles along 0-indexed axis. - Compiled to cuda_tile.cat. - """ - @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic cat(tiles, axis) +function tfunc(𝕃, ::typeof(Intrinsics.cat), @nospecialize(tiles), @nospecialize(axis_arg)) + tuple_type = CC.widenconst(tiles) tuple_type <: Tuple{Tile, Tile} || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val t1_type = tuple_type.parameters[1] @@ -186,25 +163,12 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args) end # cuda_tile.constant -@eval Intrinsics begin - """ - constant(shape, value, T) - - Create a tile filled with a constant value. - Compiled to cuda_tile.constant. - """ - @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - shape_arg = argtypes[2] +@intrinsic constant(shape, value, T) +function tfunc(𝕃, ::typeof(Intrinsics.constant), @nospecialize(shape_arg), @nospecialize(value), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[4]) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args) @@ -236,22 +200,10 @@ end # TODO: cuda_tile.entry # cuda_tile.extract -@eval Intrinsics begin - """ - extract(tile, index_val, shape_val) - - Extract a sub-tile from tile at 0-indexed slice indices. - Compiled to cuda_tile.extract. 
- """ - @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any}) - length(argtypes) >= 4 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic extract(tile, index, shape) +function tfunc(𝕃, ::typeof(Intrinsics.extract), @nospecialize(tile_lat), @nospecialize(index), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[4] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -300,15 +252,8 @@ end # TODO: cuda_tile.get_global # cuda_tile.get_num_tile_blocks -@eval Intrinsics begin - """ - get_num_tile_blocks(axis)::Int32 - - Get the grid size along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_num_tile_blocks. - """ - @noinline get_num_tile_blocks(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_num_tile_blocks(axis) +tfunc(𝕃, ::typeof(Intrinsics.get_num_tile_blocks), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_num_tile_blocks() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_num_tile_blocks() axis must be 0, 1, or 2, got $axis")) @@ -320,15 +265,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_num_tile_blocks), a end # cuda_tile.get_tile_block_id -@eval Intrinsics begin - """ - get_tile_block_id(axis)::Int32 - - Get the block ID along the given axis (0=x, 1=y, 2=z). - Compiled to cuda_tile.get_tile_block_id. - """ - @noinline get_tile_block_id(axis::Integer) = compilerbarrier(:const, zero(Int32)) -end +@intrinsic get_tile_block_id(axis) +tfunc(𝕃, ::typeof(Intrinsics.get_tile_block_id), @nospecialize(axis)) = Int32 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_tile_block_id), args) axis = @something get_constant(ctx, args[1]) throw(IRError("get_tile_block_id() axis must be a compile-time constant")) axis in (0, 1, 2) || throw(IRError("get_tile_block_id() axis must be 0, 1, or 2, got $axis")) @@ -343,25 +281,12 @@ end # TODO: cuda_tile.global # cuda_tile.iota -@eval Intrinsics begin - """ - iota(shape, T) - - Create a 1D tile with values [0, 1, 2, ..., shape[1]-1] (0-indexed). - Compiled to cuda_tile.iota. - """ - @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - shape_arg = argtypes[2] +@intrinsic iota(shape, T) +function tfunc(𝕃, ::typeof(Intrinsics.iota), @nospecialize(shape_arg), @nospecialize(type_arg_lat)) isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val - type_arg = CC.widenconst(argtypes[3]) - type_arg <: Type || return nothing - T = type_arg.parameters[1] + T = instanceof_tfunc(type_arg_lat) + T === nothing && return nothing return Tile{T, Tuple{shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) @@ -387,17 +312,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) end # cuda_tile.mmaf, cuda_tile.mmai -@eval Intrinsics begin - """ - mma(a, b, acc) - - Matrix-multiply-accumulate: result = a @ b + acc. - Compiled to cuda_tile.mmaf or cuda_tile.mmai. 
- """ - @noinline function mma(a::Tile{T1}, b::Tile{T2}, acc::Tile{T3, SC}) where {T1, T2, T3, SC} - Tile{T3, SC}() - end -end +@intrinsic mma(a::Tile, b::Tile, acc::Tile) +tfunc(𝕃, ::typeof(Intrinsics.mma), @nospecialize(a), @nospecialize(b), @nospecialize(acc)) = CC.widenconst(acc) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mma), args) cb = ctx.cb @@ -415,16 +331,15 @@ end # TODO: cuda_tile.module # cuda_tile.offset -@eval Intrinsics begin - """ - offset(base, offsets) - - Compute base_ptr + offsets for each element of offsets tile (element-scaled). - Returns a tile of pointers. Compiled to cuda_tile.offset. - """ - @noinline function offset(base::Ptr{T}, offsets::Tile{I, S}) where {T, I <: Integer, S} - Tile{Ptr{T}, S}() - end +@intrinsic offset(base, offsets) +function tfunc(𝕃, ::typeof(Intrinsics.offset), @nospecialize(base), @nospecialize(offsets)) + base_type = CC.widenconst(base) + base_type <: Ptr || return nothing + offsets_type = CC.widenconst(offsets) + offsets_type <: Tile || return nothing + T = eltype(base_type) + S = offsets_type.parameters[2] + return Tile{Ptr{T}, S} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.offset), args) cb = ctx.cb @@ -469,22 +384,10 @@ end # TODO: cudatile.pack # cuda_tile.permute -@eval Intrinsics begin - """ - permute(tile, perm_val) - - Permute tile dimensions according to 0-indexed permutation. - Compiled to cuda_tile.permute. - """ - @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic permute(tile, perm) +function tfunc(𝕃, ::typeof(Intrinsics.permute), @nospecialize(tile_lat), @nospecialize(perm_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - perm_arg = argtypes[3] isa(perm_arg, CC.Const) || return nothing perm = perm_arg.val s = size(tile_type) @@ -529,20 +432,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args) end # cuda_tile.transpose -@eval Intrinsics begin - """ - transpose(tile) - - Transpose a 2D tile, swapping its dimensions. - Compiled to cuda_tile.permute with perm=(1, 0). - """ - @noinline function transpose(tile::Tile{T}) where {T} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic transpose(tile) +function tfunc(𝕃, ::typeof(Intrinsics.transpose), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing s = size(tile_type) isempty(s) && return nothing @@ -576,29 +468,10 @@ end # cuda_tile.reduce -@eval Intrinsics begin - """ - reduce(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple) -> Tuple{Tile...} - - Reduce tiles along a 0-indexed axis using combiner `f` with per-operand - identity values. Accepts and returns tuples of tiles; single-operand - callers wrap in 1-tuples and unwrap with `[1]`. - Compiled to cuda_tile.reduce. 
- """ - @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}) where {T, S} - compilerbarrier(:type, nothing) - end - @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f, - identities::Tuple{Any, Any}) where {T1, T2, S} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic reduce(tiles, axis, f, identities) +function tfunc(𝕃, ::typeof(Intrinsics.reduce), @nospecialize(tiles), @nospecialize(axis_arg), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing - axis_arg = argtypes[3] isa(axis_arg, CC.Const) || return nothing axis = axis_arg.val result_params = Any[] @@ -724,22 +597,10 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer = IntegerIdentityVal(to_uint128(T(val)), dtype, T) # cuda_tile.reshape -@eval Intrinsics begin - """ - reshape(tile, shape_val) - - Reshape a tile to a new shape (same total elements). - Compiled to cuda_tile.reshape. - """ - @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - tile_type = CC.widenconst(argtypes[2]) +@intrinsic reshape(tile, shape) +function tfunc(𝕃, ::typeof(Intrinsics.reshape), @nospecialize(tile_lat), @nospecialize(shape_arg)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing - shape_arg = argtypes[3] isa(shape_arg, CC.Const) || return nothing shape = shape_arg.val T = eltype(tile_type) @@ -803,24 +664,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) end # cuda_tile.scan -@eval Intrinsics begin - """ - scan(tiles::Tuple{Tile...}, Val(axis), f, identities::Tuple, reverse=false) -> Tuple{Tile...} - - Parallel prefix scan along a 0-indexed axis using combiner `f` with - per-operand identity values. Accepts and returns tuples of tiles; - single-operand callers wrap in 1-tuples and unwrap with `[1]`. - `reverse=true` for a reverse (suffix) scan. - Compiled to cuda_tile.scan. - """ - @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f, - identities::Tuple{Any}, reverse::Bool=false) where {T, S} - compilerbarrier(:type, nothing) - end -end -function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tuple_type = CC.widenconst(argtypes[2]) +@intrinsic scan(tiles, axis, f, identities, reverse=false) +function tfunc(𝕃, ::typeof(Intrinsics.scan), @nospecialize(tiles), @nospecialize args...) + tuple_type = CC.widenconst(tiles) tuple_type isa DataType && tuple_type <: Tuple || return nothing result_params = Any[] for p in tuple_type.parameters @@ -916,17 +762,19 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) end # cuda_tile.select -@eval Intrinsics begin - """ - select(cond, x, y) - - Element-wise conditional selection. - Compiled to cuda_tile.select. 
- """ - @noinline select(cond::Bool, x::T, y::T) where {T} = Core.ifelse(cond, x, y) - @noinline function select(cond::Tile{Bool, S}, x::Tile{T, S}, y::Tile{T, S}) where {T, S} - Tile{T, S}() +@intrinsic select(cond::Bool, x::T, y::T) where {T}# = Core.ifelse(cond, x, y) +@intrinsic select(cond::Tile{Bool}, x::T, y::T) where {T} +function tfunc(𝕃, ::typeof(Intrinsics.select), @nospecialize(cond), @nospecialize(x), @nospecialize(y)) + if isa(cond, CC.Const) + if cond.val === true + return x + elseif cond.val === false + return y + else + return Union{} + end end + return CC.tmerge(𝕃, x, y) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.select), args) cb = ctx.cb @@ -947,21 +795,16 @@ end # These are codegen-only reinterpret intrinsics for map(f, tile). # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped. # from_scalar: restores jltype to Tile{T, S}. -@eval Intrinsics begin - @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:type, nothing) - @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}() -end -function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any}) - length(argtypes) >= 3 || return nothing - T = CC.widenconst(argtypes[2]) - shape_type = CC.widenconst(argtypes[3]) - shape_type <: Type || return nothing - S = shape_type.parameters[1] +@intrinsic to_scalar(tile) +@intrinsic from_scalar(x, S) +function tfunc(𝕃, ::typeof(Intrinsics.from_scalar), @nospecialize(x), @nospecialize(S_lat)) + T = CC.widenconst(x) + S = instanceof_tfunc(S_lat) + S === nothing && return nothing return Tile{T, S} end -function tfunc(::typeof(Intrinsics.to_scalar), argtypes::Vector{Any}) - length(argtypes) >= 2 || return nothing - tile_type = CC.widenconst(argtypes[2]) +function tfunc(𝕃, ::typeof(Intrinsics.to_scalar), @nospecialize(tile_lat)) + tile_type = CC.widenconst(tile_lat) tile_type <: Tile || return nothing return eltype(tile_type) end diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index ded13df..519cf1e 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -3,41 +3,33 @@ ## Floating-point math # cuda_tile.ceil -@eval Intrinsics begin - """Ceiling (round toward positive infinity). Compiled to cuda_tile.ceil.""" - @noinline ceil(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline ceil(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic ceil(x::AbstractFloat) +@intrinsic ceil(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.ceil), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ceil), args) emit_unop!(ctx, args, encode_CeilOp!) end # cuda_tile.cos -@eval Intrinsics begin - """Cosine. Compiled to cuda_tile.cos.""" - @noinline cos(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cos(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cos(x::AbstractFloat) +@intrinsic cos(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args) emit_unop!(ctx, args, encode_CosOp!) end # cuda_tile.cosh -@eval Intrinsics begin - """Hyperbolic cosine. 
Compiled to cuda_tile.cosh.""" - @noinline cosh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline cosh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic cosh(x::AbstractFloat) +@intrinsic cosh(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.cosh), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cosh), args) emit_unop!(ctx, args, encode_CosHOp!) end # cuda_tile.exp2 -@eval Intrinsics begin - """Base-2 exponential (2^x). Compiled to cuda_tile.exp2.""" - @noinline exp2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp2(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp2(x::AbstractFloat, flush_to_zero::Bool=false) +@intrinsic exp2(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false) +tfunc(𝕃, ::typeof(Intrinsics.exp2), @nospecialize(x), @nospecialize args...) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) cb = ctx.cb @@ -52,11 +44,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp2), args) end # cuda_tile.exp -@eval Intrinsics begin - """Natural exponential (e^x). Compiled to cuda_tile.exp.""" - @noinline exp(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline exp(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic exp(x::AbstractFloat) +@intrinsic exp(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.exp), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) cb = ctx.cb @@ -69,21 +59,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.exp), args) end # cuda_tile.floor -@eval Intrinsics begin - """Floor (round toward negative infinity). Compiled to cuda_tile.floor.""" - @noinline floor(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline floor(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic floor(x::AbstractFloat) +@intrinsic floor(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.floor), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.floor), args) emit_unop!(ctx, args, encode_FloorOp!) end # cuda_tile.fma -@eval Intrinsics begin - """Fused multiply-add: a * b + c. Compiled to cuda_tile.fma.""" - @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic fma(x::T, y::T, z::T) where {T<:AbstractFloat} +@intrinsic fma(x::Tile{T}, y::Tile{T}, z::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.fma), @nospecialize(x), @nospecialize(y), @nospecialize(z)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -99,11 +85,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) end # cuda_tile.log2 -@eval Intrinsics begin - """Base-2 logarithm. 
Compiled to cuda_tile.log2.""" - @noinline log2(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log2(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log2(x::AbstractFloat) +@intrinsic log2(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.log2), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) cb = ctx.cb @@ -116,11 +100,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log2), args) end # cuda_tile.log -@eval Intrinsics begin - """Element-wise natural logarithm. Compiled to cuda_tile.log.""" - @noinline log(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline log(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile) -end +@intrinsic log(x::AbstractFloat) +@intrinsic log(x::Tile{<:AbstractFloat}) +tfunc(𝕃, ::typeof(Intrinsics.log), @nospecialize(x)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) cb = ctx.cb @@ -133,49 +115,41 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.log), args) end # cuda_tile.maxf -@eval Intrinsics begin - @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y) - @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic maxf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic maxf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.maxf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) end # cuda_tile.minf -@eval Intrinsics begin - @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y) - @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic minf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic minf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.minf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) end # cuda_tile.pow -@eval Intrinsics begin - """Element-wise power. Compiled to cuda_tile.pow.""" - @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic pow(x::T, y::T) where {T<:AbstractFloat} +@intrinsic pow(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.pow), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) end # cuda_tile.remf -@eval Intrinsics begin - """Element-wise floating-point remainder. Compiled to cuda_tile.remf.""" - @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() -end +@intrinsic remf(x::T, y::T) where {T<:AbstractFloat} +@intrinsic remf(x::Tile{T}, y::Tile{T}) where {T<:AbstractFloat} +tfunc(𝕃, ::typeof(Intrinsics.remf), @nospecialize(x), @nospecialize(y)) = CC.widenconst(x) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) end # cuda_tile.rsqrt -@eval Intrinsics begin - """Element-wise reciprocal square root. 
-    @noinline rsqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline rsqrt(tile::Tile{T, S}, flush_to_zero::Bool=false) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic rsqrt(x::AbstractFloat, flush_to_zero::Bool=false)
+@intrinsic rsqrt(x::Tile{<:AbstractFloat}, flush_to_zero::Bool=false)
+tfunc(𝕃, ::typeof(Intrinsics.rsqrt), @nospecialize(x), @nospecialize args...) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
     cb = ctx.cb
@@ -190,31 +164,25 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.rsqrt), args)
 end
 # cuda_tile.sin
-@eval Intrinsics begin
-    """Element-wise sine. Compiled to cuda_tile.sin."""
-    @noinline sin(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sin(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sin(x::AbstractFloat)
+@intrinsic sin(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sin), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sin), args)
     emit_unop!(ctx, args, encode_SinOp!)
 end
 # cuda_tile.sinh
-@eval Intrinsics begin
-    """Element-wise hyperbolic sine. Compiled to cuda_tile.sinh."""
-    @noinline sinh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sinh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sinh(x::AbstractFloat)
+@intrinsic sinh(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sinh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sinh), args)
     emit_unop!(ctx, args, encode_SinHOp!)
 end
 # cuda_tile.sqrt
-@eval Intrinsics begin
-    """Element-wise square root. Compiled to cuda_tile.sqrt."""
-    @noinline sqrt(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline sqrt(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic sqrt(x::AbstractFloat)
+@intrinsic sqrt(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.sqrt), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
     cb = ctx.cb
@@ -227,21 +195,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.sqrt), args)
 end
 # cuda_tile.tan
-@eval Intrinsics begin
-    """Element-wise tangent. Compiled to cuda_tile.tan."""
-    @noinline tan(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tan(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tan(x::AbstractFloat)
+@intrinsic tan(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.tan), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tan), args)
     emit_unop!(ctx, args, encode_TanOp!)
 end
 # cuda_tile.tanh
-@eval Intrinsics begin
-    """Element-wise hyperbolic tangent. Compiled to cuda_tile.tanh."""
-    @noinline tanh(x::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline tanh(tile::Tile{T, S}) where {T<:AbstractFloat, S} = compilerbarrier(:const, tile)
-end
+@intrinsic tanh(x::AbstractFloat)
+@intrinsic tanh(x::Tile{<:AbstractFloat})
+tfunc(𝕃, ::typeof(Intrinsics.tanh), @nospecialize(x)) = CC.widenconst(x)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.tanh), args)
     emit_unop!(ctx, args, encode_TanHOp!)
 end
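Aside: each floats.jl hunk above applies the same three-part recipe (intrinsic declaration, return-type override, emission handler). The sketch below shows how one more unary op would slot in; it is hypothetical, since a cuda_tile.cos op and an encode_CosOp! encoder are assumed here and are not part of this patch.

# Hypothetical, not part of this patch: another unary float op following the
# same pattern, assuming cuda_tile.cos and an encode_CosOp! encoder exist.
@intrinsic cos(x::AbstractFloat)
@intrinsic cos(x::Tile{<:AbstractFloat})
tfunc(𝕃, ::typeof(Intrinsics.cos), @nospecialize(x)) = CC.widenconst(x)
function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cos), args)
    emit_unop!(ctx, args, encode_CosOp!)
end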
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 1d42ad5..d4d4f87 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -3,23 +3,15 @@
 # TODO: cuda_tile.join_tokens
 # cuda_tile.load_ptr_tko
-@eval Intrinsics begin
-    """
-    load_ptr_tko(ptrs, latency, mask=nothing, padding=nothing)
-
-    Load values from a tile of pointers.
-    If mask is provided, masked-out positions return the padding value.
-    Compiled to cuda_tile.load_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based loads as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S},
-                                    latency::Union{Int, Nothing}=nothing,
-                                    mask::Union{Tile{Bool, S}, Nothing}=nothing,
-                                    padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
-        Tile{T, S}()
-    end
+@intrinsic load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)
+function tfunc(𝕃, ::typeof(Intrinsics.load_ptr_tko), @nospecialize(ptrs), @nospecialize args...)
+    ptrs_type = CC.widenconst(ptrs)
+    ptrs_type <: Tile || return nothing
+    ptr_type = eltype(ptrs_type)
+    ptr_type <: Ptr || return nothing
+    T = eltype(ptr_type)
+    S = ptrs_type.parameters[2]
+    return Tile{T, S}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
     cb = ctx.cb
@@ -81,24 +73,12 @@ end
 # TODO: cuda_tile.make_token
 # cuda_tile.store_ptr_tko
-@eval Intrinsics begin
-    """
-    store_ptr_tko(ptrs, values, latency, mask=nothing)
-
-    Store values to a tile of pointers.
-    If mask is provided, masked-out positions are not written.
-    Compiled to cuda_tile.store_ptr_tko.
-
-    Note: TMA (allow_tma) is not applicable for pointer-based stores as they
-    support irregular access patterns incompatible with TMA requirements.
-    """
-    @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
-                                     latency::Union{Int, Nothing},
-                                     mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        donotdelete()
-        nothing
-    end
-end
+@intrinsic store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
+                         latency::Union{Int, Nothing},
+                         mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
+tfunc(𝕃, ::typeof(Intrinsics.store_ptr_tko), @nospecialize args...) = Nothing
+efunc(::typeof(Intrinsics.store_ptr_tko), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
     cb = ctx.cb
     tt = ctx.tt
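Aside: the efunc overrides above exist because the store intrinsics return Nothing; if their calls were also inferred as effect-free, Julia-level optimization could delete a store whose result is unused before codegen ever sees it. A minimal hypothetical sketch, assuming the intrinsic is called directly from kernel code:

# Hypothetical, not part of this patch: the store result is deliberately unused,
# so the effect_free=ALWAYS_FALSE override is what keeps the call alive.
function scatter_kernel(ptrs::Tile{Ptr{T}, S}, vals::Tile{T, S}) where {T, S}
    Intrinsics.store_ptr_tko(ptrs, vals, nothing)  # latency=nothing, default mask
    return nothing
end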
diff --git a/src/compiler/intrinsics/misc.jl b/src/compiler/intrinsics/misc.jl
index 0b9f332..fa1c4ba 100644
--- a/src/compiler/intrinsics/misc.jl
+++ b/src/compiler/intrinsics/misc.jl
@@ -1,12 +1,10 @@
 # miscellaneous intrinsics
 # cuda_tile.assert
-@eval Intrinsics begin
-    @noinline function assert(cond::Bool, message::String)
-        donotdelete(cond, message)
-        nothing
-    end
-end
+@intrinsic assert(cond::Bool, message::String)
+tfunc(𝕃, ::typeof(Intrinsics.assert), @nospecialize(cond), @nospecialize(message)) = Nothing
+efunc(::typeof(Intrinsics.assert), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.assert), args)
     cond = @something emit_value!(ctx, args[1]) throw(IRError("assert: cannot resolve condition"))
     message = @something get_constant(ctx, args[2]) throw(IRError("assert: requires constant message"))
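Aside: the views.jl hunks that follow convert the view intrinsics to the same declaration/tfunc/efunc pattern. As a reading aid, here is a hypothetical sketch of how they chain together, assuming direct calls with the argument orders shown in the @intrinsic declarations below, and assuming padding_mode 0 and order=nothing select the defaults:

# Hypothetical, not part of this patch: copy tile (0, 0) from one array to another.
function copy_tile_kernel(src::TileArray{Float32, 2}, dst::TileArray{Float32, 2})
    src_pv = Intrinsics.make_partition_view(Intrinsics.make_tensor_view(src), (32, 32), 0, nothing)
    dst_pv = Intrinsics.make_partition_view(Intrinsics.make_tensor_view(dst), (32, 32), 0, nothing)
    t = Intrinsics.load_partition_view(src_pv, nothing, true, (0, 0))   # latency=nothing, allow_tma=true
    Intrinsics.store_partition_view(dst_pv, t, nothing, true, (0, 0))
    return nothing
end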
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index c8f1a88..fff19b1 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -24,17 +24,8 @@ function get_padding_value(ctx::CGCtx, args)
 end
 # cuda_tile.get_index_space_shape
-@eval Intrinsics begin
-    """
-    get_index_space_shape(pv::PartitionView, axis) -> Int32
-
-    Get the number of tiles along the given axis (0-indexed).
-    Compiled to cuda_tile.get_index_space_shape.
-    """
-    @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape}
-        compilerbarrier(:const, zero(Int32))
-    end
-end
+@intrinsic get_index_space_shape(pv, axis)
+tfunc(𝕃, ::typeof(Intrinsics.get_index_space_shape), @nospecialize(pv), @nospecialize(axis)) = Int32
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.get_index_space_shape), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -69,23 +60,9 @@ end
 # TODO: cuda_tile.get_tensor_shape
 # cuda_tile.load_view_tko
-@eval Intrinsics begin
-    """
-    load_partition_view(pv::PartitionView, latency, allow_tma, index...) -> Tile
-
-    Load a tile from a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.load_view_tko.
-    """
-    @noinline function load_partition_view(pv::PartitionView{T, N, Shape},
-                                           latency::Union{Int, Nothing},
-                                           allow_tma::Bool,
-                                           indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        compilerbarrier(:type, nothing)
-    end
-end
-function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 2 || return nothing
-    pv_type = CC.widenconst(argtypes[2])
+@intrinsic load_partition_view(pv, latency, allow_tma, indices)
+function tfunc(𝕃, ::typeof(Intrinsics.load_partition_view), @nospecialize(pv), @nospecialize args...)
+    pv_type = CC.widenconst(pv)
     pv_type <: PartitionView || return nothing
     pv_type isa DataType || return nothing
     length(pv_type.parameters) >= 3 || return nothing
@@ -172,24 +149,10 @@ function pad_indices(ctx::CGCtx, index_vals::Vector{Value}, ndim::Int, idx_type:
 end
 # cuda_tile.make_partition_view
-@eval Intrinsics begin
-    """
-    make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView
-
-    Create a PartitionView from a TensorView with the given tile shape.
-    The `order` parameter (NTuple{N,Int} or nothing) specifies
-    the logical-to-physical dimension mapping (1-indexed), or identity if nothing.
-    Compiled to cuda_tile.make_partition_view.
-    """
-    @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M}
-        compilerbarrier(:type, nothing)
-    end
-end
-function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any})
-    length(argtypes) >= 3 || return nothing
-    tv_type = CC.widenconst(argtypes[2])
+@intrinsic make_partition_view(tv, shape, padding_mode, order)
+function tfunc(𝕃, ::typeof(Intrinsics.make_partition_view), @nospecialize(tv), @nospecialize(shape_arg), @nospecialize args...)
+    tv_type = CC.widenconst(tv)
     tv_type <: TensorView || return nothing
-    shape_arg = argtypes[3]
     isa(shape_arg, CC.Const) || return nothing
     shape = shape_arg.val
     T = eltype(tv_type)
@@ -336,16 +299,11 @@ function filter_dynamic_strides(stride_vals::Vector{Value}, tv_strides::Vector{I
 end
 # cuda_tile.make_tensor_view
-@eval Intrinsics begin
-    """
-    make_tensor_view(arr::TileArray) -> TensorView
-
-    Create a TensorView from a TileArray.
-    Compiled to cuda_tile.make_tensor_view.
-    """
-    @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
-        TensorView{T, N}()
-    end
+@intrinsic make_tensor_view(arr::TileArray{T, N}) where {T, N}
+function tfunc(𝕃, ::typeof(Intrinsics.make_tensor_view), @nospecialize(arr))
+    t = CC.widenconst(arr)
+    t <: TileArray || return nothing
+    TensorView{eltype(t), ndims(t)}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args)
     array_arg = args[1]
@@ -366,22 +324,14 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_tensor_view), args
 end
 # cuda_tile.store_view_tko
-@eval Intrinsics begin
-    """
-    store_partition_view(pv::PartitionView, tile, latency, allow_tma, index...) -> Nothing
-
-    Store a tile to a partition view at the given 0-indexed tile coordinates.
-    Compiled to cuda_tile.store_view_tko.
-    """
-    @noinline function store_partition_view(pv::PartitionView{T, N, Shape},
-                                            tile::Tile{T},
-                                            latency::Union{Int, Nothing},
-                                            allow_tma::Bool,
-                                            indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        donotdelete()
-        nothing
-    end
-end
+@intrinsic store_partition_view(pv::PartitionView{T, N, Shape},
+                                tile::Tile{T},
+                                latency::Union{Int, Nothing},
+                                allow_tma::Bool,
+                                indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
+tfunc(𝕃, ::typeof(Intrinsics.store_partition_view), @nospecialize args...) = Nothing
+efunc(::typeof(Intrinsics.store_partition_view), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_partition_view), args)
     cb = ctx.cb
     tt = ctx.tt
diff --git a/test/Project.toml b/test/Project.toml
index 278b9d8..bd30c97 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -8,9 +8,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-[sources]
-FileCheck = {url = "https://github.com/JuliaLLVM/FileCheck.jl", rev = "main"}
-
 [compat]
 FileCheck = "1.0"
 ParallelTestRunner = "2.0"
diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl
index 2e0cb80..a1503dc 100644
--- a/test/codegen/integration.jl
+++ b/test/codegen/integration.jl
@@ -688,6 +688,52 @@ end
 end
 end
 end
+
+@testset "float constant addition folds through addf" begin
+    @test @filecheck begin
+        @check_label "entry"
+        @check_not "addf"
+        @check "constant