diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 85addb13aef8d..11d129a5ca217 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -21,6 +21,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 
@@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur(
              "Experimentally, results are mixed, so this should be set on a "
              "case-by-case basis."));
 
+static cl::opt<bool> DisableMfmaChainOrderingDeps(
+    "amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden,
+    cl::desc("Disable artificial false dependencies between MFMA chains"));
+
 // Components of the mask that determines which instruction types may be may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // Add DAG edges that enforce SCHED_BARRIER ordering.
   void addSchedBarrierEdges(SUnit &SU);
 
+  // Link consecutive chains of data-dependent MFMA/WMMA instructions with
+  // artificial edges (tail of one chain -> leader of the next).
+  void addMfmaFalseDeps();
+
   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
   // not be reordered accross the SCHED_BARRIER. This is used for the base
   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. 
The difference is that
@@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     }
   }
 
+  if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts())
+    addMfmaFalseDeps();
+
   if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
     // PipelineSolver performs the mutation by adding the edges it
@@ -2681,6 +2693,102 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 
 } // namespace
 
+/// Link consecutive chains of data-dependent MFMA/WMMA instructions with
+/// artificial edges.
+///
+/// A chain starts at the first not-yet-visited MFMA/WMMA SUnit (in
+/// DAG->SUnits order) and is extended while the current node has exactly one
+/// MFMA/WMMA successor reached through an SDep::Data edge. For every pair of
+/// chain leaders that are adjacent in discovery order and whose chains both
+/// contain at least two SUnits, the tail of the earlier chain is made an
+/// artificial predecessor of the leader of the later chain.
+void IGroupLPDAGMutation::addMfmaFalseDeps() {
+  // SUnits already claimed by a chain, mapped to that chain's leader. Only
+  // membership is queried below.
+  DenseMap<SUnit *, SUnit *> MFMAAncestor;
+  // Chain leaders in the order they are discovered.
+  SmallVector<SUnit *> MFMAChainLeaders;
+  // Next SUnit in the chain; nullptr marks a node where the walk stopped.
+  DenseMap<SUnit *, SUnit *> MFMAChainNext;
+  for (auto &SU : DAG->SUnits) {
+    if (!TII->isMFMAorWMMA(*SU.getInstr()))
+      continue;
+
+    if (MFMAAncestor.contains(&SU))
+      continue;
+
+    SUnit *CurrMFMA = &SU;
+    MFMAAncestor[CurrMFMA] = CurrMFMA;
+    MFMAChainLeaders.push_back(&SU);
+    while (!CurrMFMA->Succs.empty()) {
+      // Count the MFMA/WMMA successors of the current MFMA instruction that
+      // are reached through a data dependency.
+      SUnit *NextMFMA = nullptr;
+      unsigned MFMADataDepSuccCount = 0;
+      for (const auto &Succ : CurrMFMA->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr()))
+          continue;
+
+        // Only data-dependency edges extend a chain.
+        if (Succ.getKind() == SDep::Data) {
+          NextMFMA = SuccSU;
+          MFMADataDepSuccCount++;
+        }
+      }
+
+      // The chain ends here unless there is exactly one such successor (zero
+      // or several both stop the walk).
+      // NOTE(review): a non-leader node at which the walk stops is never
+      // inserted into MFMAAncestor, so the outer loop can later start another
+      // chain at it. Confirm this is intended.
+      if (MFMADataDepSuccCount != 1) {
+        MFMAChainNext[CurrMFMA] = nullptr;
+        break;
+      }
+
+      // Record the current MFMA instruction as a member of SU's chain.
+      MFMAAncestor[CurrMFMA] = &SU;
+      MFMAChainNext[CurrMFMA] = NextMFMA;
+      CurrMFMA = NextMFMA;
+    }
+  }
+
+  // Walk MFMAChainNext from a leader and return the last node reached
+  // together with the number of nodes visited.
+  auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> {
+    unsigned Length = 1;
+    SUnit *Curr = Leader;
+    // lookup() returns nullptr for a missing key as well as for a stored
+    // nullptr, and never inserts into the map.
+    while (SUnit *Next = MFMAChainNext.lookup(Curr)) {
+      Curr = Next;
+      ++Length;
+    }
+    return {Curr, Length};
+  };
+
+  // Iterate over all pairs of contiguous MFMA chains and add an artificial
+  // edge if both chains are at least 2 SUs long.
+  for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) {
+    SUnit *ChainLeaderA = MFMAChainLeaders[I];
+    SUnit *ChainLeaderB = MFMAChainLeaders[I + 1];
+
+    auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA);
+    unsigned LengthB = GetTailAndLength(ChainLeaderB).second;
+
+    // Only add if both chains are at least two SUs long.
+    if (LengthA >= 2 && LengthB >= 2) {
+      // Add an artificial dependency edge from the tail of chain A to the
+      // leader of chain B.
+      LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from "
+                        << TailA->NodeNum << " to " << ChainLeaderB->NodeNum
+                        << "\n");
+      DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial));
+    }
+  }
+}
+
 /// \p Phase specifes whether or not this is a reentry into the
 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
 /// same scheduling region (e.g. 
pre and post-RA scheduling / multiple diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index 689d1472d6010..a2a00d107a7bc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -6,143 +6,143 @@ define amdgpu_kernel void @largeInterleave() #0 { ret void } ; GCN-LABEL: largeInterleave: ; GCN: ; %bb.0: - ; GCN-NEXT: ; implicit-def: $vgpr16 - ; GCN-NEXT: ; implicit-def: $vgpr25 + ; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - ; GCN-NEXT: v_readfirstlane_b32 s17, v16 + ; GCN-NEXT: v_readfirstlane_b32 s17, v0 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: s_lshl_b32 s18, s17, 7 + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: v_add_lshl_u32 v231, v2, s18, 1 + ; GCN-NEXT: v_add_u32_e32 v2, s17, v4 + ; GCN-NEXT: v_and_b32_e32 v2, 0x1fffffff, v2 + ; GCN-NEXT: ; implicit-def: $sgpr16 + ; GCN-NEXT: v_mul_lo_u32 v2, v2, s16 + ; GCN-NEXT: ; implicit-def: $vgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr9 + ; GCN-NEXT: v_add_lshl_u32 v172, v5, v2, 1 + ; GCN-NEXT: v_lshl_add_u32 v2, s17, 4, v9 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: v_mul_lo_u32 v2, v2, s6 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: v_add_lshl_u32 v224, v2, v1, 1 ; GCN-NEXT: ; implicit-def: $sgpr15 + ; GCN-NEXT: v_add_u32_e32 v1, s15, v224 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: s_lshl_b32 s18, s17, 7 - ; GCN-NEXT: ; implicit-def: $vgpr18 - ; GCN-NEXT: v_add_lshl_u32 v230, v18, s18, 1 - ; GCN-NEXT: v_lshl_add_u32 v25, s17, 4, v25 - ; GCN-NEXT: v_mul_lo_u32 v25, v25, s6 - ; GCN-NEXT: v_add_lshl_u32 v226, v25, v17, 1 - ; GCN-NEXT: v_add_u32_e32 v17, s15, v226 - ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 
sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v224, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v17, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v1, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_add_u32_e32 v72, 64, v17 - ; GCN-NEXT: ; implicit-def: $vgpr213 + ; GCN-NEXT: v_add_u32_e32 v72, 64, v1 + ; GCN-NEXT: ; implicit-def: $vgpr230 ; GCN-NEXT: ; implicit-def: $vgpr152_vgpr153_vgpr154_vgpr155 - ; GCN-NEXT: ; implicit-def: $vgpr246 - ; GCN-NEXT: v_add_u32_e32 v188, 0x80, v17 + ; GCN-NEXT: ; implicit-def: $vgpr240 + ; GCN-NEXT: v_add_u32_e32 v208, 0x80, v1 ; GCN-NEXT: ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159 ; GCN-NEXT: ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147 - ; GCN-NEXT: ; implicit-def: $vgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr26 - ; GCN-NEXT: ; implicit-def: $vgpr27 - ; GCN-NEXT: v_add_u32_e32 v227, 0xc0, v17 - ; GCN-NEXT: v_add_u32_e32 v231, v19, v26 - ; GCN-NEXT: v_add_u32_e32 v232, v19, v27 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr28 - ; GCN-NEXT: ; implicit-def: $vgpr29 - ; GCN-NEXT: v_add_u32_e32 v233, v19, v28 - ; GCN-NEXT: v_add_u32_e32 v234, v19, v29 + ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr10 + ; GCN-NEXT: ; implicit-def: $vgpr11 + ; GCN-NEXT: ; implicit-def: $vgpr12 + ; GCN-NEXT: ; implicit-def: $vgpr13 + ; GCN-NEXT: v_add_u32_e32 v225, 0xc0, v1 + ; GCN-NEXT: v_add_u32_e32 v226, v3, v10 + ; GCN-NEXT: v_add_u32_e32 v227, v3, v11 + ; GCN-NEXT: v_add_u32_e32 v228, v3, v12 + ; GCN-NEXT: v_add_u32_e32 v232, v3, v13 ; GCN-NEXT: ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: ; implicit-def: $sgpr7 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151 ; GCN-NEXT: ; implicit-def: 
$vgpr136_vgpr137_vgpr138_vgpr139 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: ; implicit-def: $sgpr7 ; GCN-NEXT: ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: v_add_u32_e32 v18, s17, v20 - ; GCN-NEXT: v_and_b32_e32 v18, 0x1fffffff, v18 - ; GCN-NEXT: ; implicit-def: $sgpr16 - ; GCN-NEXT: v_mul_lo_u32 v18, v18, s16 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: v_add_lshl_u32 v199, v21, v18, 1 - ; GCN-NEXT: ; implicit-def: $vgpr22 - ; GCN-NEXT: v_lshl_add_u32 v200, v22, 1, v199 - ; GCN-NEXT: ; implicit-def: $vgpr23 - ; GCN-NEXT: v_lshl_add_u32 v201, v23, 1, v200 - ; GCN-NEXT: ; implicit-def: $vgpr24 - ; GCN-NEXT: v_lshl_add_u32 v202, v24, 1, v201 - ; GCN-NEXT: ; implicit-def: $vgpr16 - ; GCN-NEXT: ; implicit-def: $vgpr18 - ; GCN-NEXT: ; implicit-def: $vgpr20 - ; GCN-NEXT: ; implicit-def: $vgpr24 - ; GCN-NEXT: v_add_u32_e32 v247, v19, v24 - ; GCN-NEXT: v_add_u32_e32 v248, v19, v16 - ; GCN-NEXT: v_add_u32_e32 v249, v19, v18 - ; GCN-NEXT: v_add_u32_e32 v250, v19, v20 ; GCN-NEXT: ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131 ; GCN-NEXT: ; implicit-def: $sgpr14 - ; GCN-NEXT: ; implicit-def: $vgpr196 + ; GCN-NEXT: ; implicit-def: $vgpr6 + ; GCN-NEXT: v_lshl_add_u32 v173, v6, 1, v172 + ; GCN-NEXT: ; implicit-def: $vgpr7 + ; GCN-NEXT: v_lshl_add_u32 v174, v7, 1, v173 + ; GCN-NEXT: ; implicit-def: $vgpr8 + ; GCN-NEXT: v_lshl_add_u32 v175, v8, 1, v174 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: ; implicit-def: $vgpr170 + ; GCN-NEXT: v_add_u32_e32 v242, v3, v170 + ; GCN-NEXT: v_add_u32_e32 v243, v3, v0 + ; GCN-NEXT: v_add_u32_e32 v244, v3, v2 + ; GCN-NEXT: v_add_u32_e32 v245, v3, v4 + ; GCN-NEXT: ; implicit-def: $vgpr160 ; GCN-NEXT: ; implicit-def: $sgpr12_sgpr13 - ; GCN-NEXT: ; implicit-def: $vgpr211 - ; GCN-NEXT: v_max_f32_e32 v212, v211, v211 - ; GCN-NEXT: ; implicit-def: $vgpr198 - ; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN-NEXT: ; implicit-def: $vgpr32 - ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: ; implicit-def: $vgpr34 - ; GCN-NEXT: v_add_u32_e32 v210, v19, v34 - ; GCN-NEXT: v_add_u32_e32 v206, v19, v33 - ; GCN-NEXT: v_add_u32_e32 v205, v19, v32 - ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: ; implicit-def: $vgpr21 - ; GCN-NEXT: ; implicit-def: $vgpr22 - ; GCN-NEXT: ; implicit-def: $vgpr23 - ; GCN-NEXT: ; implicit-def: $vgpr30 - ; GCN-NEXT: ; implicit-def: $vgpr31 - ; GCN-NEXT: v_add_u32_e32 v207, v19, v21 - ; GCN-NEXT: v_add_u32_e32 v208, v19, v22 - ; GCN-NEXT: v_add_u32_e32 v209, v19, v23 - ; GCN-NEXT: v_add_u32_e32 v203, v19, v30 - ; GCN-NEXT: v_add_u32_e32 v204, v19, v31 - ; GCN-NEXT: ; kill: killed $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr241 + ; GCN-NEXT: v_max_f32_e32 v246, v241, v241 + ; GCN-NEXT: ; implicit-def: $vgpr171 ; GCN-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: ; implicit-def: $vgpr161 + ; GCN-NEXT: ; implicit-def: $vgpr162 + ; GCN-NEXT: ; implicit-def: $vgpr163 + ; GCN-NEXT: ; implicit-def: $vgpr165 + ; GCN-NEXT: v_add_u32_e32 v247, v3, v165 + ; GCN-NEXT: v_add_u32_e32 v248, v3, v161 + ; GCN-NEXT: v_add_u32_e32 v249, v3, v162 + ; GCN-NEXT: v_add_u32_e32 v250, v3, v163 + ; GCN-NEXT: ; implicit-def: $vgpr164 + ; GCN-NEXT: ; implicit-def: $vgpr166 + ; GCN-NEXT: ; implicit-def: $vgpr167 + ; GCN-NEXT: ; implicit-def: $vgpr168 + ; GCN-NEXT: v_add_u32_e32 v176, v3, v168 + ; GCN-NEXT: v_add_u32_e32 v177, v3, v164 + ; GCN-NEXT: v_add_u32_e32 v178, v3, v166 + ; GCN-NEXT: v_add_u32_e32 v179, v3, v167 + ; 
GCN-NEXT: ; kill: killed $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: ; implicit-def: $vgpr197 + ; GCN-NEXT: ; implicit-def: $vgpr169 ; GCN-NEXT: ; iglp_opt mask(0x00000002) ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[64:67] + ; GCN-NEXT: ds_write_b128 v231, v[64:67] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[68:71] offset:1024 + ; GCN-NEXT: ds_write_b128 v231, v[68:71] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v226, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[180:183], v224, s[8:11], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[164:167], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[184:187], v72, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[64:67], v213 + ; GCN-NEXT: ds_read_b128 v[64:67], v230 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[64:65], v[152:153], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[66:67], v[154:155], v[112:127] - ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[64:67], v230 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[64:65], v[152:153], 0 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[66:67], v[154:155], v[96:111] - ; GCN-NEXT: ds_read_b128 v[64:67], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[64:67], v230 
offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[168:171], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[188:191], v230 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[172:175], v246 + ; GCN-NEXT: ds_read_b128 v[192:195], v240 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[196:199], v240 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[180:183], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[200:203], v240 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1536 + ; GCN-NEXT: ds_read_b128 v[204:207], v240 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART @@ -150,293 +150,293 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[64:65], v[152:153], 0 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[160:163] + ; GCN-NEXT: ds_write_b128 v231, v[180:183] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[66:67], v[154:155], v[80:95] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[164:167] offset:1024 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[168:169], v[152:153], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[170:171], v[154:155], v[64:79] + ; GCN-NEXT: ds_write_b128 v231, v[184:187] offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[152:153], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[154:155], v[64:79] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v224, s[8:11], 0 offen offset:128 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx4 v[160:163], v188, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[180:183], v208, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[188:191], v213 + ; GCN-NEXT: ds_read_b128 v[184:187], v230 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[192:195], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[188:191], v230 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[164:167], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[208:211], v230 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[214:217], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[212:215], v230 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[172:173], v[156:157], v[112:127] - ; GCN-NEXT: ds_read_b128 v[218:221], v246 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[192:193], v[156:157], v[112:127] + ; GCN-NEXT: ds_read_b128 v[216:219], v240 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[222:225], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[220:223], v240 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[168:171], v246 offset:1024 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[194:195], v[158:159], v[112:127] + ; GCN-NEXT: ds_read_b128 v[192:195], v240 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[174:175], v[158:159], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[188:189], v[144:145], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[190:191], v[146:147], 
v[112:127] - ; GCN-NEXT: ds_read_b128 v[188:191], v246 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[144:145], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[146:147], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v240 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[152:155] + ; GCN-NEXT: ds_write_b128 v231, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[160:163] offset:1024 + ; GCN-NEXT: ds_write_b128 v231, v[180:183] offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[152:155], v224, s[8:11], 0 offen offset:192 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79] - ; GCN-NEXT: buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[180:183], v225, s[8:11], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[216:217], v[140:141], v[112:127] + ; GCN-NEXT: buffer_load_dwordx2 v[224:225], v226, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[226:227], v227, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_perm_b32 v234, v226, v224, s7 + ; GCN-NEXT: buffer_load_dwordx2 v[228:229], v228, s[0:3], 0 
offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[204:205], v[156:157], v[64:79] + ; GCN-NEXT: buffer_load_dwordx2 v[216:217], v232, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79] - ; GCN-NEXT: v_perm_b32 v238, v162, v160, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127] - ; GCN-NEXT: v_perm_b32 v240, v162, v160, s7 - ; GCN-NEXT: v_perm_b32 v242, v163, v161, s5 - ; GCN-NEXT: v_perm_b32 v244, v163, v161, s7 - ; GCN-NEXT: ds_read_b128 v[160:163], v213 + ; GCN-NEXT: v_perm_b32 v232, v226, v224, s5 + ; GCN-NEXT: v_perm_b32 v236, v227, v225, s5 + ; GCN-NEXT: v_perm_b32 v238, v227, v225, s7 + ; GCN-NEXT: v_perm_b32 v233, v216, v228, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[156:157], v[96:111] + ; GCN-NEXT: v_perm_b32 v235, v216, v228, s7 + ; GCN-NEXT: v_perm_b32 v237, v217, v229, s5 + ; GCN-NEXT: v_perm_b32 v239, v217, v229, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[206:207], v[158:159], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[158:159], v[96:111] + ; GCN-NEXT: ds_read_b128 v[196:199], v230 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_perm_b32 v239, v174, v172, s5 - ; GCN-NEXT: v_perm_b32 v241, v174, v172, s7 - ; GCN-NEXT: v_perm_b32 v243, v175, v173, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[144:145], v[64:79] - ; GCN-NEXT: v_perm_b32 v245, v175, v173, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[176:177], v[156:157], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[220:221], v[142:143], v[112:127] - ; GCN-NEXT: ds_read_b128 v[218:221], v213 offset:512 + 
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[156:157], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[144:145], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[202:203], v[158:159], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[188:189], v[144:145], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[142:143], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[146:147], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[190:191], v[146:147], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[208:209], v[144:145], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[196:197], v[148:149], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[140:141], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[140:141], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[210:211], v[146:147], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[198:199], v[150:151], v[112:127] + ; GCN-NEXT: ds_read_b128 v[196:199], v230 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[172:175], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[204:207], v230 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[146:147], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[178:179], v[158:159], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[160:161], v[148:149], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[188:189], v[140:141], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[192:193], v[144:145], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[162:163], v[150:151], v[112:127] - ; GCN-NEXT: ds_read_b128 v[160:163], v213 offset:1536 + ; GCN-NEXT: ds_read_b128 v[212:215], v230 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: 
buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[184:187], v246 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[142:143], v[64:79] + ; GCN-NEXT: ds_read_b128 v[184:187], v240 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[214:217], v246 offset:512 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[142:143], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[192:193], v[140:141], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[212:213], v[148:149], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[196:197], v[148:149], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[194:195], v[142:143], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[214:215], v[150:151], v[64:79] + ; GCN-NEXT: ds_read_b128 v[212:215], v240 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[176:179], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[200:203], v240 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[190:191], v[142:143], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[194:195], v[146:147], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[148:149], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[156:157], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[222:223], v[140:141], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[162:163], v[150:151], v[64:79] - ; GCN-NEXT: ds_read_b128 v[160:163], v246 offset:1536 + ; GCN-NEXT: ds_read_b128 v[216:219], v240 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v230, v[152:155] + ; GCN-NEXT: 
ds_write_b128 v231, v[152:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b128 v230, v[226:229] offset:1024 + ; GCN-NEXT: ds_write_b128 v231, v[180:183] offset:1024 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[182:183], v[158:159], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[184:185], v[136:137], v[112:127] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[156:159], v213 + ; GCN-NEXT: ds_read_b128 v[152:155], v230 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[226:229], v213 offset:512 + ; GCN-NEXT: ds_read_b128 v[180:183], v230 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[180:183], v213 offset:1024 + ; GCN-NEXT: ds_read_b128 v[224:227], v230 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[152:155], v213 offset:1536 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[198:199], v[150:151], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[204:205], v[148:149], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] + ; GCN-NEXT: ds_read_b128 v[184:187], v230 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[230:233], v246 + ; GCN-NEXT: ds_read_b128 v[228:231], v240 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[234:237], v246 offset:512 + ; GCN-NEXT: ds_read_b128 v[188:191], v240 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[186:187], v[138:139], v[112:127] - ; GCN-NEXT: ds_read_b128 v[184:187], v246 offset:1024 + ; GCN-NEXT: ds_read_b128 v[220:223], v240 offset:1024 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[224:225], v[142:143], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[156:157], v[132:133], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[218:219], v[148:149], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[158:159], v[134:135], v[112:127] - ; GCN-NEXT: ds_read_b128 v[156:159], v246 offset:1536 + ; GCN-NEXT: ds_read_b128 v[196:199], v240 offset:1536 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[238:239] + ; GCN-NEXT: ds_write_b64 v172, v[232:233] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[212:213], v[136:137], v[96:111] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[240:241] + ; GCN-NEXT: ds_write_b64 v173, v[234:235] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[242:243] + ; GCN-NEXT: ds_write_b64 v174, v[236:237] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[244:245] + ; GCN-NEXT: ds_write_b64 v175, v[238:239] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[192:193], v247, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v242, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[220:221], v[150:151], v[96:111] - ; GCN-NEXT: buffer_load_dwordx2 v[194:195], v248, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v243, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[218:219], v249, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[232:233], v244, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 
- ; GCN-NEXT: buffer_load_dwordx2 v[220:221], v250, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[234:235], v245, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[206:207], v[150:151], v[80:95] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_perm_b32 v188, v194, v192, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[164:165], v[144:145], v[80:95] - ; GCN-NEXT: v_perm_b32 v189, v220, v218, s5 - ; GCN-NEXT: v_perm_b32 v191, v220, v218, s7 - ; GCN-NEXT: v_perm_b32 v190, v194, v192, s7 - ; GCN-NEXT: v_perm_b32 v192, v195, v193, s5 - ; GCN-NEXT: v_perm_b32 v194, v195, v193, s7 - ; GCN-NEXT: v_perm_b32 v193, v221, v219, s5 - ; GCN-NEXT: v_perm_b32 v195, v221, v219, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[166:167], v[146:147], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[168:169], v[140:141], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[170:171], v[142:143], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[172:173], v[148:149], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[136:137], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[174:175], v[150:151], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[216:217], v[138:139], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[176:177], v[136:137], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[226:227], v[132:133], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[178:179], v[138:139], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[160:161], v[136:137], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[128:129], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[228:229], v[134:135], v[96:111] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[180:181], v[132:133], v[80:95] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], 
v[162:163], v[138:139], v[64:79] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[232:233], v[130:131], v[112:127] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[234:235], v[128:129], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[132:133], v[112:127] + ; GCN-NEXT: v_perm_b32 v152, v158, v156, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[214:215], v[138:139], v[96:111] + ; GCN-NEXT: v_perm_b32 v153, v234, v232, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[200:201], v[136:137], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[134:135], v[112:127] + ; GCN-NEXT: v_perm_b32 v154, v158, v156, s7 + ; GCN-NEXT: v_perm_b32 v155, v234, v232, s7 + ; GCN-NEXT: v_perm_b32 v156, v159, v157, s5 + ; GCN-NEXT: v_perm_b32 v158, v159, v157, s7 + ; GCN-NEXT: v_perm_b32 v157, v235, v233, s5 + ; GCN-NEXT: v_perm_b32 v159, v235, v233, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[180:181], v[132:133], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[202:203], v[138:139], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[216:217], v[136:137], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[228:229], v[128:129], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[182:183], v[134:135], v[96:111] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[224:225], v[132:133], v[80:95] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[218:219], v[138:139], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[230:231], v[130:131], v[112:127] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[188:189], v[128:129], v[96:111] ; GCN-NEXT: s_nop 9 - ; GCN-NEXT: v_mul_f32_e32 v213, s4, v112 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v113 - ; GCN-NEXT: v_max3_f32 v213, v213, s14, v218 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v114 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v115 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v116 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[182:183], v[134:135], v[80:95] - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v117 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v118 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v119 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v120 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v121 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[152:153], v[132:133], v[64:79] - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v122 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v123 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v124 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v125 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[236:237], v[130:131], v[96:111] - ; GCN-NEXT: v_mul_f32_e32 v218, s4, v126 - ; GCN-NEXT: v_mul_f32_e32 v219, s4, v127 - ; GCN-NEXT: v_max3_f32 v213, v213, v218, v219 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[184:185], v[128:129], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v228, s4, v112 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v113 + ; GCN-NEXT: v_max3_f32 v228, v228, s14, v229 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v114 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v115 + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v116 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[226:227], v[134:135], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v117 + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v118 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v119 + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v120 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v121 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[132:133], v[64:79] + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v122 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v123 + ; GCN-NEXT: v_max3_f32 v228, v228, 
v229, v230 + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v124 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v125 + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[190:191], v[130:131], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v229, s4, v126 + ; GCN-NEXT: v_mul_f32_e32 v230, s4, v127 + ; GCN-NEXT: v_max3_f32 v228, v228, v229, v230 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[220:221], v[128:129], v[80:95] ; GCN-NEXT: s_nop 6 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v96 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v97 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v98 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v99 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v100 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[154:155], v[134:135], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v101 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v102 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v103 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v104 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v105 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[186:187], v[130:131], v[80:95] - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v106 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v107 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v108 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v109 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[156:157], v[128:129], v[64:79] - ; GCN-NEXT: v_mul_f32_e32 v214, s4, v110 - ; GCN-NEXT: v_mul_f32_e32 v215, s4, v111 - ; GCN-NEXT: v_max3_f32 v213, v213, v214, v215 + ; GCN-NEXT: v_mul_f32_e32 v180, s4, v96 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v97 + ; GCN-NEXT: v_max3_f32 v180, v228, v180, v181 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v98 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v99 + ; GCN-NEXT: 
v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v100 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[134:135], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v101 + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v102 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v103 + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v104 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v105 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[222:223], v[130:131], v[80:95] + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v106 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v107 + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v108 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v109 + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[196:197], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v181, s4, v110 + ; GCN-NEXT: v_mul_f32_e32 v182, s4, v111 + ; GCN-NEXT: v_max3_f32 v180, v180, v181, v182 ; GCN-NEXT: v_mul_f32_e32 v140, s4, v80 ; GCN-NEXT: v_mul_f32_e32 v141, s4, v81 - ; GCN-NEXT: v_max3_f32 v140, v213, v140, v141 + ; GCN-NEXT: v_max3_f32 v140, v180, v140, v141 ; GCN-NEXT: v_mul_f32_e32 v141, s4, v82 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[158:159], v[130:131], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[198:199], v[130:131], v[64:79] ; GCN-NEXT: v_mul_f32_e32 v142, s4, v83 ; GCN-NEXT: v_max3_f32 v140, v140, v141, v142 ; GCN-NEXT: v_mul_f32_e32 v141, s4, v84 @@ -481,41 +481,35 @@ ; GCN-NEXT: v_mul_f32_e32 v129, s4, v78 ; GCN-NEXT: v_mul_f32_e32 v130, s4, v79 ; GCN-NEXT: v_max3_f32 v128, v128, v129, v130 - ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: ds_bpermute_b32 v129, v160, v128 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[130:133], v198 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 
v[134:137], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_max_f32_e32 v129, v129, v129 ; GCN-NEXT: v_max_f32_e32 v128, v128, v129 - ; GCN-NEXT: ds_bpermute_b32 v129, v196, v128 + ; GCN-NEXT: ds_bpermute_b32 v129, v160, v128 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cndmask_b32_e64 v128, v129, v128, s[12:13] ; GCN-NEXT: v_max_f32_e32 v128, v128, v128 - ; GCN-NEXT: v_max_f32_e32 v128, v212, v128 - ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v128 + ; GCN-NEXT: v_max_f32_e32 v137, v246, v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v113, -v137 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v114, -v137 ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v115, -v137 ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v116, -v137 ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v117, -v137 ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v128 - ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v118, -v137 + ; GCN-NEXT: v_fma_f32 v112, s4, v112, -v137 ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v113 - ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v128 - ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v128 - ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v128 + ; GCN-NEXT: v_fma_f32 v113, s4, v119, -v137 + ; GCN-NEXT: v_fma_f32 v118, s4, v120, -v137 + ; GCN-NEXT: v_fma_f32 v120, s4, v121, -v137 ; GCN-NEXT: v_mul_f32_e32 v112, 0x3fb8aa3b, v112 ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v113 ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v120 - ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v128 + ; GCN-NEXT: v_fma_f32 v120, s4, v122, -v137 ; GCN-NEXT: v_exp_f32_e32 v114, v138 ; 
GCN-NEXT: v_exp_f32_e32 v115, v139 ; GCN-NEXT: v_exp_f32_e32 v116, v140 @@ -526,37 +520,41 @@ ; GCN-NEXT: v_exp_f32_e32 v120, v144 ; GCN-NEXT: v_exp_f32_e32 v113, v112 ; GCN-NEXT: v_cvt_f16_f32_e32 v119, v114 - ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 - ; GCN-NEXT: v_sub_f32_e32 v129, v211, v128 + ; GCN-NEXT: v_sub_f32_e32 v128, v241, v137 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v113 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v129 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 + ; GCN-NEXT: ds_read_b128 v[128:131], v171 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v121, v116 + ; GCN-NEXT: ds_read_b128 v[132:135], v171 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v128 ; GCN-NEXT: v_pack_b32_f16 v146, v112, v119 ; GCN-NEXT: v_cvt_f16_f32_e32 v112, v115 + ; GCN-NEXT: ds_read_b128 v[138:141], v171 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v122, s4, v123, -v137 ; GCN-NEXT: v_mul_f32_e32 v151, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 - ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v128 ; GCN-NEXT: v_pack_b32_f16 v147, v112, v121 - ; GCN-NEXT: v_exp_f32_e32 v112, v129 + ; GCN-NEXT: v_exp_f32_e32 v112, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v123, v117 + ; GCN-NEXT: v_fma_f32 v122, s4, v124, -v137 ; GCN-NEXT: v_cvt_f16_f32_e32 v124, v118 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v122 - ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v128 - ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], 
v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[24:25], v[24:25], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v119, v143 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 + ; GCN-NEXT: ds_read_b128 v[142:145], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[112:113] op_sel_hi:[1,0] @@ -565,18 +563,19 @@ ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[16:17], v[16:17], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[18:19], v[18:19], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[146:147], v[32:47] - ; GCN-NEXT: v_mul_f32_e64 v20, v20, v112 - ; GCN-NEXT: v_mul_f32_e64 v21, v21, v112 - ; GCN-NEXT: v_mul_f32_e64 v22, v22, v112 - ; GCN-NEXT: v_mul_f32_e64 v23, v23, v112 - ; 
GCN-NEXT: v_mul_f32_e64 v24, v24, v112 - ; GCN-NEXT: v_mul_f32_e64 v25, v25, v112 - ; GCN-NEXT: v_pk_mul_f32 v[26:27], v[26:27], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v121, v148 + ; GCN-NEXT: v_pack_b32_f16 v148, v123, v124 + ; GCN-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[112:113] op_sel_hi:[1,0] + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v119 + ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v137 ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[112:113] op_sel_hi:[1,0] @@ -585,568 +584,576 @@ ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[112:113] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[112:113] op_sel_hi:[1,0] - ; GCN-NEXT: v_pack_b32_f16 v134, v123, v124 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v119 - ; GCN-NEXT: v_fma_f32 v124, s4, v126, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v120 - ; GCN-NEXT: v_exp_f32_e32 v121, v148 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[146:147], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v122 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[138:139], 
v[146:147], v[0:15] ; GCN-NEXT: v_exp_f32_e32 v122, v149 - ; GCN-NEXT: v_pack_b32_f16 v135, v130, v126 + ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v137 + ; GCN-NEXT: v_pack_b32_f16 v149, v128, v126 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v124 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v121 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 - ; GCN-NEXT: v_fma_f32 v139, s4, v96, -v128 - ; GCN-NEXT: v_fma_f32 v127, s4, v127, -v128 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v127 + ; GCN-NEXT: ds_read_b128 v[126:129], v169 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v125, s4, v125, -v137 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[146:147], v[48:63] ; GCN-NEXT: v_exp_f32_e32 v123, v150 - ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_fma_f32 v143, s4, v101, -v128 - ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v128 - ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v128 - ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v128 - ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[134:135], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v121 + ; GCN-NEXT: v_fma_f32 v143, s4, v96, -v137 + ; GCN-NEXT: v_fma_f32 v64, s4, v64, -v137 + ; GCN-NEXT: v_fma_f32 v65, s4, v65, -v137 + ; GCN-NEXT: v_fma_f32 v68, s4, v68, -v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[130:131], v[148:149], v[16:31] ; GCN-NEXT: v_exp_f32_e32 v124, v151 - ; GCN-NEXT: ds_read_b128 v[130:133], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: ds_read_b128 v[130:133], v169 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v122 - ; GCN-NEXT: v_exp_f32_e32 v96, v129 - ; GCN-NEXT: v_fma_f32 v137, s4, v97, -v128 - ; GCN-NEXT: 
v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v136 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v123 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] + ; GCN-NEXT: v_fma_f32 v69, s4, v69, -v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[148:149], v[32:47] + ; GCN-NEXT: v_fma_f32 v135, s4, v97, -v137 + ; GCN-NEXT: v_exp_f32_e32 v96, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v122 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_pack_b32_f16 v134, v142, v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[140:141], v[148:149], v[0:15] ; GCN-NEXT: v_exp_f32_e32 v97, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_fma_f32 v137, s4, v98, -v128 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v124 - ; GCN-NEXT: v_fma_f32 v135, s4, v99, -v128 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v123 + ; GCN-NEXT: v_fma_f32 v140, s4, v98, -v137 + ; GCN-NEXT: v_mul_f32_e32 v180, 0x3fb8aa3b, v140 + ; GCN-NEXT: v_fma_f32 v140, s4, v99, -v137 + ; GCN-NEXT: v_mul_f32_e32 v181, 0x3fb8aa3b, v140 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[148:149], v[48:63] ; GCN-NEXT: v_exp_f32_e32 v98, v138 - ; GCN-NEXT: v_exp_f32_e32 v99, v127 - ; GCN-NEXT: v_mul_f32_e32 v150, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_pack_b32_f16 v127, v136, v134 - ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v124 + ; GCN-NEXT: v_pack_b32_f16 v135, v135, v138 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v99, v139 + ; GCN-NEXT: ds_read_b128 v[138:141], v169 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: 
ds_read_b128 v[142:145], v169 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[126:127], v[0:15] - ; GCN-NEXT: v_fma_f32 v131, s4, v100, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v96 - ; GCN-NEXT: v_exp_f32_e32 v100, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v97 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[188:189] + ; GCN-NEXT: ds_write_b64 v172, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[190:191] + ; GCN-NEXT: ds_write_b64 v173, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[192:193] + ; GCN-NEXT: ds_write_b64 v174, v[156:157] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[194:195] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[126:127], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v101, v125 - ; GCN-NEXT: v_pack_b32_f16 v146, v130, v131 + ; GCN-NEXT: ds_write_b64 v175, v[158:159] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v210, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[126:127], v247, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v98 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[126:127], v[16:31] - ; GCN-NEXT: v_fma_f32 v134, s4, v102, -v128 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v134 - ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v207, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v248, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v102, v142 - ; 
GCN-NEXT: buffer_load_dwordx2 v[142:143], v208, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v249, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v209, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[150:151], v250, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v96 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[130:131], v[134:135], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v97 + ; GCN-NEXT: v_fma_f32 v153, s4, v100, -v137 + ; GCN-NEXT: v_exp_f32_e32 v100, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v153 + ; GCN-NEXT: v_pack_b32_f16 v130, v152, v130 + ; GCN-NEXT: v_fma_f32 v131, s4, v101, -v137 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[126:127], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v99 - ; GCN-NEXT: v_fma_f32 v127, s4, v103, -v128 - ; GCN-NEXT: v_exp_f32_e32 v103, v150 - ; GCN-NEXT: v_fma_f32 v139, s4, v105, -v128 - ; GCN-NEXT: v_pack_b32_f16 v147, v147, v126 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[138:139], v[134:135], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v101, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_fma_f32 v131, s4, v103, -v137 + ; GCN-NEXT: v_fma_f32 v139, s4, v104, -v137 + ; GCN-NEXT: v_perm_b32 v156, v147, v127, s5 + ; GCN-NEXT: v_perm_b32 v152, v146, v126, s5 + ; GCN-NEXT: v_perm_b32 v153, v150, v148, s5 + ; GCN-NEXT: v_perm_b32 v155, v150, v148, s7 + ; GCN-NEXT: v_perm_b32 v150, v147, v127, s7 + ; GCN-NEXT: v_fma_f32 v127, s4, v102, -v137 + ; GCN-NEXT: v_perm_b32 v154, v146, v126, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v98 ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_perm_b32 v152, v135, v131, s5 - ; GCN-NEXT: v_perm_b32 v154, v135, v131, s7 - ; GCN-NEXT: v_fma_f32 v135, s4, v104, 
-v128 - ; GCN-NEXT: v_perm_b32 v126, v134, v130, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] - ; GCN-NEXT: v_perm_b32 v150, v134, v130, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v99 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[134:135], v[48:63] + ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_exp_f32_e32 v102, v180 + ; GCN-NEXT: v_pack_b32_f16 v131, v126, v127 ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v100 - ; GCN-NEXT: v_exp_f32_e32 v104, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v101 - ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: v_perm_b32 v157, v151, v149, s5 + ; GCN-NEXT: v_perm_b32 v151, v151, v149, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[130:131], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v103, v181 + ; GCN-NEXT: ds_read_b128 v[126:129], v171 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_perm_b32 v127, v144, v142, s5 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] - ; GCN-NEXT: v_pack_b32_f16 v148, v134, v135 - ; GCN-NEXT: v_fma_f32 v135, s4, v106, -v128 - ; GCN-NEXT: v_exp_f32_e32 v105, v125 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v102 - ; GCN-NEXT: v_perm_b32 v151, v144, v142, s7 - ; GCN-NEXT: v_perm_b32 v153, v145, v143, s5 - ; GCN-NEXT: v_perm_b32 v155, v145, v143, s7 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[146:147], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v106, v156 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v103 - ; GCN-NEXT: v_fma_f32 v136, s4, v107, -v128 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_pack_b32_f16 v149, v134, v135 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] - ; GCN-NEXT: v_mul_f32_e32 
v146, 0x3fb8aa3b, v136 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v107, v138 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: ds_read_b128 v[146:149], v171 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[148:149], v[0:15] - ; GCN-NEXT: v_fma_f32 v131, s4, v108, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v104 - ; GCN-NEXT: v_exp_f32_e32 v108, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v105 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[148:149], v[32:47] - ; GCN-NEXT: v_fma_f32 v142, s4, v109, -v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[132:133], v[130:131], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v101 + ; GCN-NEXT: v_exp_f32_e32 v104, v136 + ; GCN-NEXT: v_fma_f32 v133, s4, v105, -v137 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v139 + ; GCN-NEXT: v_pack_b32_f16 v134, v134, v132 + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v102 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[140:141], v[130:131], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v105, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v133 + ; GCN-NEXT: v_fma_f32 v133, s4, v106, -v137 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v133 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[130:131], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v103 + ; GCN-NEXT: v_fma_f32 v131, s4, v107, -v137 + ; GCN-NEXT: v_exp_f32_e32 v106, v138 + ; GCN-NEXT: v_exp_f32_e32 v107, v135 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v135, v132, v130 + ; GCN-NEXT: ds_read_b128 v[130:133], v171 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[138:141], v171 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31] + ; GCN-NEXT: v_fma_f32 v127, s4, v108, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v104 + ; GCN-NEXT: v_exp_f32_e32 v108, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v105 + ; GCN-NEXT: v_fma_f32 v144, s4, v109, -v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[134:135], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v146, v126, v127 + ; GCN-NEXT: v_fma_f32 v127, s4, v110, -v137 ; GCN-NEXT: v_exp_f32_e32 v109, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v142 - ; GCN-NEXT: v_pack_b32_f16 v142, v130, v131 - ; GCN-NEXT: v_fma_f32 v131, s4, v110, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v106 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[148:149], v[16:31] - ; GCN-NEXT: v_mul_f32_e32 v134, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v107 - ; GCN-NEXT: v_exp_f32_e32 v110, v156 - ; GCN-NEXT: v_fma_f32 v135, s4, v111, -v128 - ; GCN-NEXT: v_mul_f32_e32 v135, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_pack_b32_f16 v143, v130, v131 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[148:149], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v111, v146 - ; GCN-NEXT: v_fma_f32 v139, s4, v80, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v108 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v80, v129 - ; GCN-NEXT: ds_read_b128 v[130:133], v197 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v139 - ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v109 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] - ; GCN-NEXT: v_fma_f32 v144, s4, v81, -v128 - ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v106 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, 
v144 - ; GCN-NEXT: v_pack_b32_f16 v144, v138, v139 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[142:143], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v110 - ; GCN-NEXT: v_fma_f32 v137, s4, v82, -v128 - ; GCN-NEXT: v_exp_f32_e32 v82, v134 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v111 - ; GCN-NEXT: v_mul_f32_e32 v156, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_fma_f32 v137, s4, v83, -v128 - ; GCN-NEXT: v_mul_f32_e32 v157, 0x3fb8aa3b, v137 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v83, v135 - ; GCN-NEXT: v_pack_b32_f16 v145, v136, v134 - ; GCN-NEXT: ds_read_b128 v[134:137], v197 offset:1152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[134:135], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v130, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v107 + ; GCN-NEXT: v_exp_f32_e32 v110, v142 + ; GCN-NEXT: v_fma_f32 v131, s4, v111, -v137 + ; GCN-NEXT: v_mul_f32_e32 v131, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v147, v126, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v111, v143 + ; GCN-NEXT: v_fma_f32 v135, s4, v80, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v108 + ; GCN-NEXT: v_fma_f32 v138, s4, v81, -v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31] + ; GCN-NEXT: ds_read_b128 v[126:129], v169 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v80, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v109 + ; GCN-NEXT: ds_read_b128 v[142:145], v169 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v134, v134, v135 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v81, v125 + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], 
v[146:147], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v110 + ; GCN-NEXT: v_fma_f32 v133, s4, v82, -v137 + ; GCN-NEXT: v_exp_f32_e32 v82, v130 + ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v111 + ; GCN-NEXT: v_mul_f32_e32 v180, 0x3fb8aa3b, v133 + ; GCN-NEXT: v_fma_f32 v133, s4, v83, -v137 + ; GCN-NEXT: v_mul_f32_e32 v181, 0x3fb8aa3b, v133 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v83, v131 + ; GCN-NEXT: v_pack_b32_f16 v135, v132, v130 + ; GCN-NEXT: ds_read_b128 v[130:133], v169 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v197 offset:1728 + ; GCN-NEXT: ds_read_b128 v[138:141], v169 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[126:127] + ; GCN-NEXT: ds_write_b64 v172, v[152:153] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[150:151] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[144:145], v[0:15] + ; GCN-NEXT: ds_write_b64 v173, v[154:155] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[152:153] + ; GCN-NEXT: ds_write_b64 v174, v[156:157] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[154:155] - ; GCN-NEXT: v_fma_f32 v127, s4, v84, -v128 - ; GCN-NEXT: v_exp_f32_e32 v84, v129 - ; GCN-NEXT: v_fma_f32 v130, s4, v85, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v80 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[144:145], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v85, v125 - ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v130 + ; GCN-NEXT: ds_write_b64 v175, v[150:151] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31] 
; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v206, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[126:127], v176, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v81 - ; GCN-NEXT: v_pack_b32_f16 v126, v126, v127 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[144:145], v[16:31] - ; GCN-NEXT: v_fma_f32 v134, s4, v86, -v128 - ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v134 - ; GCN-NEXT: buffer_load_dwordx2 v[134:135], v203, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v177, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[142:143], v204, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v178, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[146:147], v205, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[150:151], v179, s[0:3], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v82 - ; GCN-NEXT: v_exp_f32_e32 v86, v156 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[144:145], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v138, v83 + ; GCN-NEXT: v_fma_f32 v153, s4, v84, -v137 + ; GCN-NEXT: v_exp_f32_e32 v84, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v152, v80 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[134:135], v[32:47] + ; GCN-NEXT: v_fma_f32 v143, s4, v85, -v137 + ; GCN-NEXT: v_exp_f32_e32 v85, v125 + ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v81 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_fma_f32 v139, s4, v87, -v128 - ; GCN-NEXT: v_exp_f32_e32 v87, v157 - ; GCN-NEXT: v_pack_b32_f16 v127, v127, v138 - ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v128 - ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v139 - ; 
GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[126:127], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v153 + ; GCN-NEXT: v_pack_b32_f16 v142, v152, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[134:135], v[0:15] + ; GCN-NEXT: v_fma_f32 v131, s4, v87, -v137 + ; GCN-NEXT: v_mul_f32_e32 v131, 0x3fb8aa3b, v131 ; GCN-NEXT: ; implicit-def: $sgpr0 - ; GCN-NEXT: v_perm_b32 v154, v135, v131, s5 - ; GCN-NEXT: v_perm_b32 v156, v135, v131, s7 - ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v128 - ; GCN-NEXT: v_perm_b32 v150, v134, v130, s5 - ; GCN-NEXT: v_perm_b32 v152, v134, v130, s7 - ; GCN-NEXT: ds_read_b128 v[130:133], v198 + ; GCN-NEXT: v_add_u32_e32 v170, s0, v170 + ; GCN-NEXT: v_add_u32_e32 v165, s0, v165 + ; GCN-NEXT: v_add_u32_e32 v168, s0, v168 + ; GCN-NEXT: v_add_u32_e32 v167, s0, v167 + ; GCN-NEXT: v_add_u32_e32 v166, s0, v166 + ; GCN-NEXT: v_add_u32_e32 v164, s0, v164 + ; GCN-NEXT: v_add_u32_e32 v163, s0, v163 + ; GCN-NEXT: v_add_u32_e32 v162, s0, v162 + ; GCN-NEXT: v_add_u32_e32 v161, s0, v161 + ; GCN-NEXT: v_perm_b32 v158, v147, v127, s5 + ; GCN-NEXT: v_perm_b32 v176, v147, v127, s7 + ; GCN-NEXT: v_fma_f32 v127, s4, v86, -v137 + ; GCN-NEXT: v_perm_b32 v154, v146, v126, s5 + ; GCN-NEXT: v_perm_b32 v156, v146, v126, s7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v82 + ; GCN-NEXT: v_mul_f32_e32 v130, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v83 + ; GCN-NEXT: v_exp_f32_e32 v86, v180 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v87, v181 + ; GCN-NEXT: v_pack_b32_f16 v143, v126, v127 + ; GCN-NEXT: v_fma_f32 v135, s4, v88, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 + ; GCN-NEXT: v_fma_f32 v138, s4, v89, -v137 + ; GCN-NEXT: v_perm_b32 v155, v150, v148, s5 + ; GCN-NEXT: v_perm_b32 v157, v150, v148, s7 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[142:143], v[16:31] + ; GCN-NEXT: ds_read_b128 v[126:129], 
v171 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v84 - ; GCN-NEXT: v_exp_f32_e32 v88, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v135 + ; GCN-NEXT: v_exp_f32_e32 v88, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v135 ; GCN-NEXT: v_cvt_f16_f32_e32 v135, v85 - ; GCN-NEXT: v_perm_b32 v151, v146, v142, s5 - ; GCN-NEXT: v_perm_b32 v153, v146, v142, s7 - ; GCN-NEXT: v_perm_b32 v155, v147, v143, s5 - ; GCN-NEXT: v_perm_b32 v157, v147, v143, s7 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:576 + ; GCN-NEXT: v_perm_b32 v159, v151, v149, s5 + ; GCN-NEXT: v_perm_b32 v177, v151, v149, s7 + ; GCN-NEXT: ds_read_b128 v[146:149], v171 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[126:127], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[142:143], v[32:47] ; GCN-NEXT: v_exp_f32_e32 v89, v125 - ; GCN-NEXT: v_pack_b32_f16 v146, v134, v135 - ; GCN-NEXT: v_cvt_f16_f32_e32 v134, v86 - ; GCN-NEXT: v_fma_f32 v135, s4, v90, -v128 + ; GCN-NEXT: v_pack_b32_f16 v134, v134, v135 ; GCN-NEXT: v_mul_f32_e32 v125, 0x3fb8aa3b, v138 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v135 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[126:127], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v90, v158 - ; GCN-NEXT: v_mul_f32_e32 v158, 0x3fb8aa3b, v64 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[126:127], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v87 - ; GCN-NEXT: v_fma_f32 v127, s4, v91, -v128 - ; GCN-NEXT: v_exp_f32_e32 v91, v139 - ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 - ; GCN-NEXT: v_pack_b32_f16 v147, v134, v126 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:1152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v86 + ; GCN-NEXT: v_fma_f32 v133, s4, v90, -v137 + ; GCN-NEXT: v_exp_f32_e32 v90, v130 + ; GCN-NEXT: 
v_cvt_f16_f32_e32 v130, v87 + ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v133 + ; GCN-NEXT: v_fma_f32 v133, s4, v91, -v137 + ; GCN-NEXT: v_pack_b32_f16 v135, v132, v130 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[142:143], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v91, v131 + ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v133 + ; GCN-NEXT: ds_read_b128 v[130:133], v171 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1728 + ; GCN-NEXT: ds_read_b128 v[138:141], v171 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], v[146:147], v[0:15] - ; GCN-NEXT: v_fma_f32 v130, s4, v92, -v128 + ; GCN-NEXT: v_fma_f32 v143, s4, v93, -v137 + ; GCN-NEXT: v_mul_f32_e32 v178, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[126:127], v[134:135], v[16:31] ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v88 - ; GCN-NEXT: v_exp_f32_e32 v92, v129 - ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v130 - ; GCN-NEXT: v_cvt_f16_f32_e32 v130, v89 - ; GCN-NEXT: v_fma_f32 v131, s4, v93, -v128 - ; GCN-NEXT: v_pack_b32_f16 v130, v126, v130 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[142:143], v[146:147], v[32:47] + ; GCN-NEXT: v_fma_f32 v127, s4, v92, -v137 + ; GCN-NEXT: v_exp_f32_e32 v92, v136 + ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v89 + ; GCN-NEXT: v_mul_f32_e32 v127, 0x3fb8aa3b, v127 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[134:135], v[32:47] + ; GCN-NEXT: v_pack_b32_f16 v146, v126, v136 + ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v137 ; GCN-NEXT: v_exp_f32_e32 v93, v125 - ; GCN-NEXT: v_fma_f32 v126, s4, v94, -v128 ; GCN-NEXT: v_cvt_f16_f32_e32 v125, v90 - ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v126 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v137 + ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v137 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[130:131], 
v[134:135], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v130, 0x3fb8aa3b, v126 ; GCN-NEXT: v_cvt_f16_f32_e32 v126, v91 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[134:135], v[146:147], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v94, v148 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[146:147], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v95, v127 - ; GCN-NEXT: v_cvt_f16_f32_e32 v127, v92 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v131 - ; GCN-NEXT: v_pack_b32_f16 v131, v125, v126 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[130:131], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v125, v129 - ; GCN-NEXT: ds_read_b128 v[132:135], v197 + ; GCN-NEXT: v_exp_f32_e32 v94, v144 + ; GCN-NEXT: v_fma_f32 v131, s4, v95, -v137 + ; GCN-NEXT: v_mul_f32_e32 v131, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_pack_b32_f16 v147, v125, v126 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v95, v142 + ; GCN-NEXT: ds_read_b128 v[142:145], v169 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[146:149], v197 offset:576 + ; GCN-NEXT: ds_read_b128 v[150:153], v169 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[144:145], v[130:131], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_fma_f32 v65, s4, v66, -v128 - ; GCN-NEXT: v_exp_f32_e32 v126, v142 - ; GCN-NEXT: v_pack_b32_f16 v142, v127, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[128:129], v[146:147], v[16:31] + ; GCN-NEXT: v_cvt_f16_f32_e32 v128, v92 + ; GCN-NEXT: v_mul_f32_e32 v129, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v93 + ; GCN-NEXT: v_exp_f32_e32 v125, v127 + ; GCN-NEXT: v_pack_b32_f16 v134, v128, v64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], 
v[146:147], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v126, v178 ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v94 - ; GCN-NEXT: v_mul_f32_e32 v145, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[146:147], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v132, 0x3fb8aa3b, v65 ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v95 - ; GCN-NEXT: v_fma_f32 v66, s4, v67, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[136:137], v[130:131], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v127, v143 - ; GCN-NEXT: v_pack_b32_f16 v143, v64, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[130:131], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v129, v138 - ; GCN-NEXT: v_mul_f32_e32 v141, 0x3fb8aa3b, v66 - ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:1152 + ; GCN-NEXT: v_mul_f32_e32 v133, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_exp_f32_e32 v127, v130 + ; GCN-NEXT: v_pack_b32_f16 v135, v64, v65 + ; GCN-NEXT: ds_read_b128 v[64:67], v169 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[136:139], v197 offset:1728 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[146:147], v[48:63] + ; GCN-NEXT: ds_read_b128 v[138:141], v169 offset:1728 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v128, v131 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b64 v199, v[150:151] + ; GCN-NEXT: ds_write_b64 v172, v[154:155] + ; GCN-NEXT: v_cvt_f16_f32_e32 v131, v125 + ; GCN-NEXT: v_mul_f32_e32 v154, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[142:143], v[134:135], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v129, v129 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v200, v[152:153] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[132:133], v[142:143], v[0:15] - ; GCN-NEXT: v_cvt_f16_f32_e32 v132, v125 - ; GCN-NEXT: 
v_exp_f32_e32 v130, v158 + ; GCN-NEXT: ds_write_b64 v173, v[156:157] + ; GCN-NEXT: v_pack_b32_f16 v68, v131, v68 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v201, v[154:155] + ; GCN-NEXT: ds_write_b64 v174, v[158:159] ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b64 v202, v[156:157] + ; GCN-NEXT: ds_write_b64 v175, v[176:177] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[142:143], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v126 - ; GCN-NEXT: v_exp_f32_e32 v131, v144 - ; GCN-NEXT: v_mul_f32_e32 v144, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_fma_f32 v69, s4, v71, -v128 - ; GCN-NEXT: v_pack_b32_f16 v140, v132, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v129 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[142:143], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[150:151], v[134:135], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v130, v136 + ; GCN-NEXT: v_mul_f32_e32 v136, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[134:135], v[0:15] ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v127 - ; GCN-NEXT: v_exp_f32_e32 v132, v145 - ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v128 + ; GCN-NEXT: v_exp_f32_e32 v131, v132 + ; GCN-NEXT: v_fma_f32 v65, s4, v70, -v137 + ; GCN-NEXT: v_fma_f32 v70, s4, v71, -v137 + ; GCN-NEXT: v_pack_b32_f16 v69, v64, v69 + ; GCN-NEXT: v_fma_f32 v71, s4, v72, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v129 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[134:135], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v132, v133 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v71 + ; GCN-NEXT: v_cvt_f16_f32_e32 v71, v130 ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_fma_f32 v145, s4, v73, -v128 - ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v145 - ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[48:63], v[136:137], v[142:143], v[48:63] - ; GCN-NEXT: v_exp_f32_e32 v133, v141 - ; GCN-NEXT: v_mul_f32_e32 v142, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_pack_b32_f16 v141, v64, v68 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[68:71], v198 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_fma_f32 v143, s4, v72, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v130 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[134:135], v[140:141], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v72, v146 - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v143 - ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v131 - ; GCN-NEXT: ds_read_b128 v[134:137], v198 offset:576 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v64, v64, v143 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[140:141], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v73, v144 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[140:141], v[16:31] - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v132 - ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v128 - ; GCN-NEXT: v_exp_f32_e32 v74, v65 - ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v133 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[138:139], v[140:141], v[48:63] - ; GCN-NEXT: v_fma_f32 v138, s4, v75, -v128 - ; GCN-NEXT: v_exp_f32_e32 v75, v142 - ; GCN-NEXT: v_mul_f32_e32 v148, 0x3fb8aa3b, v138 - ; GCN-NEXT: ds_read_b128 v[138:141], v198 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[142:145], v198 offset:1728 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v72 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_fma_f32 v68, s4, v76, -v128 - ; GCN-NEXT: v_exp_f32_e32 v76, v146 - ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v68 - ; GCN-NEXT: 
v_cvt_f16_f32_e32 v68, v73 - ; GCN-NEXT: v_fma_f32 v69, s4, v77, -v128 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v77, v147 - ; GCN-NEXT: v_pack_b32_f16 v134, v66, v68 - ; GCN-NEXT: v_fma_f32 v68, s4, v78, -v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v74 - ; GCN-NEXT: v_mul_f32_e32 v147, 0x3fb8aa3b, v69 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v78, v67 - ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v68 - ; GCN-NEXT: v_cvt_f16_f32_e32 v139, v76 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[142:143], v[64:65], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v75 - ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v128 - ; GCN-NEXT: v_exp_f32_e32 v79, v148 - ; GCN-NEXT: v_mul_f32_e32 v128, 0x3fb8aa3b, v65 - ; GCN-NEXT: v_pack_b32_f16 v135, v66, v64 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[134:135], v[0:15] - ; GCN-NEXT: v_exp_f32_e32 v142, v146 - ; GCN-NEXT: ds_read_b128 v[68:71], v197 + ; GCN-NEXT: v_mul_f32_e32 v70, 0x3fb8aa3b, v70 + ; GCN-NEXT: v_fma_f32 v72, s4, v73, -v137 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[144:145], v[68:69], v[16:31] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[64:67], v197 offset:576 + ; GCN-NEXT: ds_read_b128 v[142:145], v171 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[136:137], v[134:135], v[32:47] - ; GCN-NEXT: v_exp_f32_e32 v137, v147 - ; GCN-NEXT: v_cvt_f16_f32_e32 v136, v77 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[140:141], v[134:135], v[16:31] - ; GCN-NEXT: v_exp_f32_e32 v138, v138 - ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v78 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63] - ; GCN-NEXT: s_nop 10 - ; GCN-NEXT: v_exp_f32_e32 v52, v128 - ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 
- ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v142 - ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v138 - ; GCN-NEXT: v_cvt_f16_f32_e32 v53, v52 - ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v79 - ; GCN-NEXT: v_pack_b32_f16 v50, v51, v50 - ; GCN-NEXT: v_pack_b32_f16 v48, v139, v136 - ; GCN-NEXT: v_pack_b32_f16 v51, v54, v53 - ; GCN-NEXT: v_add_f32_e32 v53, 0, v113 - ; GCN-NEXT: v_add_f32_e32 v53, v114, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v115, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v116, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v117, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v118, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v119, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v120, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v121, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v122, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v123, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v124, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v96, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v97, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v98, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v99, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v100, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v101, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v102, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v103, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v104, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v105, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v106, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v107, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v108, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v109, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v110, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v111, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v80, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v81, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v82, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v83, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v84, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v85, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v86, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v87, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v88, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v89, v53 - ; GCN-NEXT: v_pack_b32_f16 v49, v140, v49 - ; GCN-NEXT: v_add_f32_e32 v53, v90, 
v53 - ; GCN-NEXT: v_add_f32_e32 v53, v91, v53 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15] - ; GCN-NEXT: v_add_f32_e32 v53, v92, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v93, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v94, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v95, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v125, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v126, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v127, v53 - ; GCN-NEXT: v_add_f32_e32 v53, v129, v53 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47] - ; GCN-NEXT: s_nop 9 - ; GCN-NEXT: v_add_f32_e32 v0, v130, v53 - ; GCN-NEXT: v_add_f32_e32 v0, v131, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v132, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v133, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v72, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v73, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v74, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v75, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v76, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v77, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v78, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v79, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v142, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v137, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v138, v0 - ; GCN-NEXT: v_add_f32_e32 v4, v52, v0 - ; GCN-NEXT: ds_bpermute_b32 v5, v196, v4 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1152 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31] - ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 - ; GCN-NEXT: ds_bpermute_b32 v3, v196, v2 - ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: ds_read_b128 v[146:149], v171 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[12:13] - ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v112 - ; GCN-NEXT: ds_read_b128 v[0:3], v197 offset:1728 + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v133, v154 + ; GCN-NEXT: 
v_pack_b32_f16 v64, v64, v71 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[152:153], v[68:69], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v134, v136 + ; GCN-NEXT: ds_read_b128 v[150:153], v171 offset:1152 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[68:69], v[0:15] + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v131 + ; GCN-NEXT: v_exp_f32_e32 v135, v65 + ; GCN-NEXT: v_cvt_f16_f32_e32 v65, v132 + ; GCN-NEXT: v_fma_f32 v67, s4, v74, -v137 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_fma_f32 v74, s4, v77, -v137 + ; GCN-NEXT: v_pack_b32_f16 v65, v66, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[140:141], v[68:69], v[48:63] + ; GCN-NEXT: v_exp_f32_e32 v136, v70 + ; GCN-NEXT: ds_read_b128 v[70:73], v171 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v69, s4, v76, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v133 + ; GCN-NEXT: v_fma_f32 v68, s4, v75, -v137 + ; GCN-NEXT: v_mul_f32_e32 v68, 0x3fb8aa3b, v68 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[142:143], v[64:65], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v76, v138 + ; GCN-NEXT: v_mul_f32_e32 v138, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v134 + ; GCN-NEXT: v_mul_f32_e32 v143, 0x3fb8aa3b, v74 + ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v76 + ; GCN-NEXT: v_pack_b32_f16 v74, v66, v69 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[146:147], v[64:65], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v77, v139 + ; GCN-NEXT: v_fma_f32 v69, s4, v78, -v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v66, v135 + ; GCN-NEXT: v_mul_f32_e32 v140, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v77 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[150:151], v[64:65], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v78, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[64:65], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v64, v136 + ; GCN-NEXT: v_fma_f32 v65, s4, v79, -v137 + ; GCN-NEXT: v_exp_f32_e32 
v79, v68 + ; GCN-NEXT: v_mul_f32_e32 v139, 0x3fb8aa3b, v65 + ; GCN-NEXT: v_pack_b32_f16 v75, v66, v64 + ; GCN-NEXT: ds_read_b128 v[64:67], v169 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[68:71], v169 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[144:145], v[74:75], v[16:31] + ; GCN-NEXT: v_exp_f32_e32 v137, v138 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[148:149], v[74:75], v[32:47] + ; GCN-NEXT: v_exp_f32_e32 v138, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v143, v78 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[152:153], v[74:75], v[0:15] + ; GCN-NEXT: v_exp_f32_e32 v140, v140 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[74:75], v[48:63] + ; GCN-NEXT: ; implicit-def: $vgpr58 + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_add_f32_e32 v57, 0, v113 + ; GCN-NEXT: v_add_f32_e32 v57, v114, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v115, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v116, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v117, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v118, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v119, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v120, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v121, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v122, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v123, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v124, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v96, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v97, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v98, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v99, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v100, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v101, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v102, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v103, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v104, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v105, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v106, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v107, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v108, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v109, v57 + ; GCN-NEXT: 
v_add_f32_e32 v57, v110, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v111, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v80, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v81, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v82, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v83, v57 + ; GCN-NEXT: v_cvt_f16_f32_e32 v48, v79 + ; GCN-NEXT: v_add_f32_e32 v57, v84, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v85, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v86, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v87, v57 + ; GCN-NEXT: v_exp_f32_e32 v56, v139 + ; GCN-NEXT: v_pack_b32_f16 v52, v141, v142 + ; GCN-NEXT: v_pack_b32_f16 v53, v143, v48 + ; GCN-NEXT: v_add_f32_e32 v57, v88, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v89, v57 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[64:65], v[52:53], v[16:31] + ; GCN-NEXT: v_add_f32_e32 v57, v90, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v91, v57 + ; GCN-NEXT: v_cvt_f16_f32_e32 v49, v138 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v137 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v56 + ; GCN-NEXT: v_cvt_f16_f32_e32 v55, v140 + ; GCN-NEXT: v_add_f32_e32 v57, v92, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v93, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v94, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v95, v57 + ; GCN-NEXT: v_pack_b32_f16 v54, v50, v49 + ; GCN-NEXT: v_pack_b32_f16 v55, v55, v51 + ; GCN-NEXT: v_add_f32_e32 v57, v125, v57 + ; GCN-NEXT: v_add_f32_e32 v57, v126, v57 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[66:67], v[54:55], v[16:31] + ; GCN-NEXT: ds_read_b128 v[48:51], v169 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[48:49], v[52:53], v[0:15] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_add_f32_e32 v16, v127, v57 + ; GCN-NEXT: v_add_f32_e32 v16, v128, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v129, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v130, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v131, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v132, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v133, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v134, v16 + ; GCN-NEXT: v_add_f32_e32 v16, 
v135, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v136, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v76, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v77, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v78, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v79, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v137, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v138, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v140, v16 + ; GCN-NEXT: v_add_f32_e32 v20, v56, v16 + ; GCN-NEXT: ds_read_b128 v[16:19], v169 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_bpermute_b32 v16, v160, v20 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[52:53], v[32:47] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v16, v20, v16 + ; GCN-NEXT: ds_bpermute_b32 v17, v160, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[50:51], v[54:55], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[54:55], v[32:47] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: s_nop 8 + ; GCN-NEXT: v_cndmask_b32_e64 v0, v17, v16, s[12:13] + ; GCN-NEXT: v_fmac_f32_e32 v0, v58, v112 ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir index 0887fdf0844b0..677dad6083c3b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -13,21 +13,18 @@ ; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $vgpr52 ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: ; implicit-def: $vgpr49 + ; GCN-NEXT: ; implicit-def: $vgpr9 + ; GCN-NEXT: ; implicit-def: 
$vgpr32_vgpr33_vgpr34_vgpr35 + ; GCN-NEXT: ; implicit-def: $vgpr70 ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 - ; GCN-NEXT: ; implicit-def: $vgpr51 - ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 - ; GCN-NEXT: ; implicit-def: $vgpr76 - ; GCN-NEXT: ; implicit-def: $vgpr77 - ; GCN-NEXT: ; implicit-def: $vgpr78 - ; GCN-NEXT: ; implicit-def: $vgpr79 - ; GCN-NEXT: ; implicit-def: $vgpr80 - ; GCN-NEXT: ; implicit-def: $vgpr91 - ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: ; iglp_opt mask(0x00000002) - ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: ; implicit-def: $vgpr58_vgpr59_vgpr60_vgpr61 + ; GCN-NEXT: ; implicit-def: $vgpr66_vgpr67_vgpr68_vgpr69 + ; GCN-NEXT: ; implicit-def: $vgpr54 + ; GCN-NEXT: ; implicit-def: $vgpr55 + ; GCN-NEXT: v_add_u32_e32 v71, v54, v52 + ; GCN-NEXT: v_add_u32_e32 v72, v55, v52 ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 @@ -35,459 +32,465 @@ ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: s_lshl_b32 s4, s20, 7 ; GCN-NEXT: ; implicit-def: $vgpr5 - ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 - ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 - ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: v_add_lshl_u32 v8, v5, s4, 1 + ; GCN-NEXT: ; implicit-def: $vgpr5 + ; GCN-NEXT: ; implicit-def: $vgpr57 + ; GCN-NEXT: ; implicit-def: $vgpr53 + ; GCN-NEXT: ; kill: killed $vgpr72 + ; GCN-NEXT: ; kill: killed $vgpr71 + ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: v_add_u32_e32 v5, v5, v52 + ; GCN-NEXT: ; kill: killed $vgpr5 ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[0:3] + ; GCN-NEXT: ds_write_b128 v8, v[0:3] ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:64 sc0 sc1 ; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr0 - ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: ; implicit-def: $sgpr6 - ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 - ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 - ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: ; implicit-def: $sgpr1 + ; GCN-NEXT: v_add_u32_e32 v4, v4, v52 + ; GCN-NEXT: buffer_load_dwordx2 v[48:49], v4, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[50:51], v5, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: ds_read_b128 v[36:39], v49 + ; GCN-NEXT: ; kill: killed $vgpr4 + ; GCN-NEXT: ds_read_b128 v[4:7], v9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512 + ; GCN-NEXT: ds_read_b128 v[36:39], v9 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 - ; GCN-NEXT: ; kill: killed $vgpr1 - ; GCN-NEXT: ; kill: killed $vgpr0 - ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 - ; GCN-NEXT: v_add_lshl_u32 v76, v77, v76, 1 - ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 - ; GCN-NEXT: ; implicit-def: $sgpr5 - ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 ; GCN-NEXT: ; implicit-def: $sgpr2 ; GCN-NEXT: ; implicit-def: $sgpr3 - ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 - ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; 
GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[32:33], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[34:35], v[16:31] + ; GCN-NEXT: ds_read_b128 v[4:7], v70 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: ds_read_b128 v[44:47], v70 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b128 v48, v[32:35] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: ds_write_b128 v8, v[0:3] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[4:5], v[40:41], v[16:31] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[32:35], v49 + ; GCN-NEXT: ds_read_b128 v[0:3], v9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] - ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] - ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 + ; GCN-NEXT: ds_read_b128 v[62:65], v9 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[6:7], v[42:43], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[58:59], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[60:61], v[16:31] + ; GCN-NEXT: ds_read_b128 v[0:3], v70 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; 
GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[66:67], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[2:3], v[68:69], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[36:37], v[32:33], 0 ; GCN-NEXT: ; implicit-def: $vgpr32 ; GCN-NEXT: ; implicit-def: $vgpr33 - ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 - ; GCN-NEXT: v_add_u32_e32 v83, v33, v50 - ; GCN-NEXT: ; kill: killed $vgpr82 - ; GCN-NEXT: ; kill: killed $vgpr83 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] - ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: ; implicit-def: $vgpr36 + ; GCN-NEXT: ; implicit-def: $vgpr37 + ; GCN-NEXT: v_add_u32_e32 v32, s20, v32 + ; GCN-NEXT: v_and_b32_e32 v32, 0x1fffffff, v32 + ; GCN-NEXT: v_mul_lo_u32 v32, v32, s1 + ; GCN-NEXT: v_add_lshl_u32 v54, v33, v32, 1 + ; GCN-NEXT: v_lshl_add_u32 v55, v36, 1, v54 + ; GCN-NEXT: v_lshl_add_u32 v56, v37, 1, v55 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[38:39], v[34:35], v[0:15] + ; GCN-NEXT: v_lshl_add_u32 v57, v57, 1, v56 + ; GCN-NEXT: v_perm_b32 v36, v50, v48, s2 + ; GCN-NEXT: v_perm_b32 v37, v50, v48, s3 + ; GCN-NEXT: v_perm_b32 v38, v51, v49, s2 + ; GCN-NEXT: v_perm_b32 v39, v51, v49, s3 + ; GCN-NEXT: ds_read_b128 v[32:35], v70 offset:512 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] - ; GCN-NEXT: ; implicit-def: $vgpr66 - ; 
GCN-NEXT: ; implicit-def: $vgpr67 - ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 - ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] - ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 - ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 - ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: ds_write_b32 v54, v36 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: ds_write_b32 v55, v37 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v72 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] - ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 - ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v22 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 - ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 - ; GCN-NEXT: v_max3_f32 v64, v64, v80, 
v84 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 - ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 - ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 - ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 - ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 - ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 - ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 - ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 - ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 - ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 - ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 - ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 - ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 - ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 - ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 - ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 - ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 - ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 - ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: ds_write_b32 v56, v38 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v68 - ; GCN-NEXT: ; implicit-def: $vgpr84 - ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 - ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: ds_write_b32 v57, v39 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[48:49], v71, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[50:51], v72, s[16:19], 0 offen sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: v_mul_f32_e32 v36, s4, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v37, s4, v17 + ; GCN-NEXT: v_mul_f32_e32 v38, s4, 
v18 + ; GCN-NEXT: v_mul_f32_e32 v39, s4, v19 + ; GCN-NEXT: v_max3_f32 v36, v36, s0, v37 + ; GCN-NEXT: v_mul_f32_e32 v40, s4, v20 + ; GCN-NEXT: v_mul_f32_e32 v41, s4, v21 + ; GCN-NEXT: v_mul_f32_e32 v42, s4, v22 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[62:63], v[58:59], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v43, s4, v23 + ; GCN-NEXT: v_mul_f32_e32 v44, s4, v24 + ; GCN-NEXT: v_mul_f32_e32 v45, s4, v25 + ; GCN-NEXT: v_mul_f32_e32 v46, s4, v26 + ; GCN-NEXT: v_mul_f32_e32 v47, s4, v27 + ; GCN-NEXT: v_mul_f32_e32 v58, s4, v28 + ; GCN-NEXT: v_mul_f32_e32 v59, s4, v29 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[64:65], v[60:61], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v60, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v61, s4, v31 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_max_f32_e32 v65, v64, v64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[32:33], v[66:67], v[0:15] + ; GCN-NEXT: v_max3_f32 v32, v36, v38, v39 + ; GCN-NEXT: v_max3_f32 v32, v32, v40, v41 + ; GCN-NEXT: v_max3_f32 v32, v32, v42, v43 + ; GCN-NEXT: v_max3_f32 v32, v32, v44, v45 + ; GCN-NEXT: v_max3_f32 v32, v32, v46, v47 + ; GCN-NEXT: v_max3_f32 v32, v32, v58, v59 + ; GCN-NEXT: v_max3_f32 v32, v32, v60, v61 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[34:35], v[68:69], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $vgpr67 + ; GCN-NEXT: s_nop 10 + ; GCN-NEXT: v_mul_f32_e32 v33, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v34, s4, v1 + ; GCN-NEXT: v_mul_f32_e32 v35, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v36, s4, v3 + ; GCN-NEXT: v_max3_f32 v32, v32, v33, v34 + ; GCN-NEXT: v_mul_f32_e32 v37, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v38, s4, v5 + ; GCN-NEXT: v_max3_f32 v32, v32, v35, v36 + ; GCN-NEXT: v_mul_f32_e32 v39, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v40, s4, v7 + ; GCN-NEXT: v_max3_f32 v32, v32, v37, v38 + ; GCN-NEXT: v_mul_f32_e32 v41, s4, v8 + ; GCN-NEXT: 
v_mul_f32_e32 v42, s4, v9 + ; GCN-NEXT: v_max3_f32 v32, v32, v39, v40 + ; GCN-NEXT: v_mul_f32_e32 v43, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v44, s4, v11 + ; GCN-NEXT: v_max3_f32 v32, v32, v41, v42 + ; GCN-NEXT: v_mul_f32_e32 v45, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v46, s4, v13 + ; GCN-NEXT: v_max3_f32 v32, v32, v43, v44 + ; GCN-NEXT: v_mul_f32_e32 v47, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v58, s4, v15 + ; GCN-NEXT: v_max3_f32 v32, v32, v45, v46 + ; GCN-NEXT: v_max3_f32 v32, v32, v47, v58 + ; GCN-NEXT: ds_bpermute_b32 v33, v53, v32 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] - ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 - ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 - ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 - ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 - ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN-NEXT: ds_read_b128 v[58:61], v66 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_max_f32_e32 v33, v33, v33 + ; GCN-NEXT: v_max_f32_e32 v62, v32, v33 + ; GCN-NEXT: ds_bpermute_b32 v63, v53, v62 + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v62, v63, v62, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v62, v62, v62 + ; GCN-NEXT: v_max_f32_e32 v63, v65, v62 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v63 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v63 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v63 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v63 ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 - ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 - ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 - ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 - ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 - ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 - ; GCN-NEXT: v_exp_f32_e32 v73, 
v16 - ; GCN-NEXT: v_exp_f32_e32 v74, v18 - ; GCN-NEXT: v_exp_f32_e32 v75, v19 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v63 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v63 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v63 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v63 + ; GCN-NEXT: v_exp_f32_e32 v16, v16 + ; GCN-NEXT: v_exp_f32_e32 v68, v17 + ; GCN-NEXT: v_exp_f32_e32 v69, v18 + ; GCN-NEXT: v_exp_f32_e32 v70, v19 + ; GCN-NEXT: v_sub_f32_e32 v62, v64, v63 + ; GCN-NEXT: v_mul_f32_e32 v62, 0x3fb8aa3b, v62 ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 - ; GCN-NEXT: v_exp_f32_e32 v80, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 - ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 - ; GCN-NEXT: v_exp_f32_e32 v81, v21 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 - ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 - ; GCN-NEXT: v_exp_f32_e32 v82, v22 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 - ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 - ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 - ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 + ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v63 + ; GCN-NEXT: v_exp_f32_e32 v71, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v68 + ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v63 + ; GCN-NEXT: v_exp_f32_e32 v72, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v69 + ; GCN-NEXT: v_fma_f32 v24, s4, v26, -v63 + ; GCN-NEXT: v_exp_f32_e32 v73, v22 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v70 + ; GCN-NEXT: v_fma_f32 v74, s4, v27, -v63 + ; GCN-NEXT: v_exp_f32_e32 v75, v23 + ; GCN-NEXT: v_exp_f32_e32 v62, v62 + ; GCN-NEXT: v_pack_b32_f16 v64, v17, v19 + ; GCN-NEXT: v_pack_b32_f16 v65, v21, v22 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], 
v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[58:59], v[64:65], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v76, 0, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v71 + ; GCN-NEXT: v_fma_f32 v59, s4, v28, -v63 + ; GCN-NEXT: v_exp_f32_e32 v77, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v78, v72 + ; GCN-NEXT: v_fma_f32 v79, s4, v29, -v63 + ; GCN-NEXT: v_exp_f32_e32 v80, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v81, v73 + ; GCN-NEXT: v_fma_f32 v82, s4, v30, -v63 ; GCN-NEXT: v_exp_f32_e32 v83, v23 - ; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72 - ; GCN-NEXT: v_exp_f32_e32 v85, v22 - ; GCN-NEXT: v_exp_f32_e32 v17, v17 - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 - ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 - ; GCN-NEXT: v_exp_f32_e32 v88, v23 - ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 - ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 - ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_exp_f32_e32 v16, v24 - ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: v_cvt_f16_f32_e32 v84, v75 + ; GCN-NEXT: v_perm_b32 v28, v50, v48, s2 + ; GCN-NEXT: v_perm_b32 v29, v50, v48, s3 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v74 + ; GCN-NEXT: v_fma_f32 v85, s4, v31, -v63 + ; GCN-NEXT: ds_read_b128 v[16:19], v66 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: 
v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 - ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 - ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 - ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 - ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 - ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 - ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 - ; GCN-NEXT: v_exp_f32_e32 v30, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 - ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 - ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 - ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 - ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 - ; GCN-NEXT: ds_read_b128 
v[26:29], v91 + ; GCN-NEXT: ds_read_b128 v[20:23], v67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: v_perm_b32 v30, v51, v49, s2 + ; GCN-NEXT: v_perm_b32 v31, v51, v49, s3 + ; GCN-NEXT: v_exp_f32_e32 v74, v48 + ; GCN-NEXT: v_pack_b32_f16 v48, v58, v78 + ; GCN-NEXT: v_pack_b32_f16 v49, v81, v84 + ; GCN-NEXT: ds_read_b128 v[24:27], v67 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_wbl2 sc0 sc1 - ; GCN-NEXT: ds_write_b32 v76, v31 - ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_exp_f32_e32 v31, v31 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 - ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 - ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: ds_write_b32 v54, v28 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[60:61], v[48:49], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v50, 0x3fb8aa3b, v59 + ; GCN-NEXT: v_mul_f32_e32 v51, 0x3fb8aa3b, v79 + ; GCN-NEXT: v_mul_f32_e32 v54, 0x3fb8aa3b, v82 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: ds_write_b32 v55, v29 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: ds_write_b32 v56, v30 + ; GCN-NEXT: v_cvt_f16_f32_e32 v56, v77 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v63 + ; GCN-NEXT: v_exp_f32_e32 v60, v50 + ; GCN-NEXT: v_cvt_f16_f32_e32 v50, v80 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v63 + ; GCN-NEXT: v_exp_f32_e32 v61, v51 + ; GCN-NEXT: v_cvt_f16_f32_e32 v51, v83 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v63 + ; GCN-NEXT: v_exp_f32_e32 v78, v54 + ; GCN-NEXT: v_cvt_f16_f32_e32 v54, v74 + ; GCN-NEXT: v_mul_f32_e32 v55, 0x3fb8aa3b, v85 + ; GCN-NEXT: v_fma_f32 v3, s4, v3, -v63 + ; GCN-NEXT: v_exp_f32_e32 v79, v55 + ; GCN-NEXT: v_pack_b32_f16 v50, v56, v50 + ; GCN-NEXT: v_pack_b32_f16 v51, 
v51, v54 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[50:51], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v60 + ; GCN-NEXT: v_fma_f32 v4, s4, v4, -v63 + ; GCN-NEXT: v_exp_f32_e32 v81, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v61 + ; GCN-NEXT: v_fma_f32 v5, s4, v5, -v63 + ; GCN-NEXT: v_exp_f32_e32 v82, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v78 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v63 + ; GCN-NEXT: v_exp_f32_e32 v84, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v79 ; GCN-NEXT: buffer_wbl2 sc0 sc1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_write_b32 v79, v65 - ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 - ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 - ; GCN-NEXT: v_exp_f32_e32 v22, v64 - ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 - ; GCN-NEXT: v_exp_f32_e32 v64, v65 - ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 - ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 - ; GCN-NEXT: v_exp_f32_e32 v23, v23 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 - ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 - ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 - ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 - ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 - ; GCN-NEXT: v_exp_f32_e32 v25, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 - ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 - ; GCN-NEXT: v_exp_f32_e32 v27, v3 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 - ; 
GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 - ; GCN-NEXT: v_exp_f32_e32 v65, v65 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] - ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 - ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 - ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 - ; GCN-NEXT: v_exp_f32_e32 v68, v2 - ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: ds_write_b32 v57, v31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: ds_read_b128 v[28:31], v66 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 - ; GCN-NEXT: v_exp_f32_e32 v24, v24 - ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v63 + ; GCN-NEXT: v_exp_f32_e32 v85, v3 + ; GCN-NEXT: v_pack_b32_f16 v54, v20, v0 + ; GCN-NEXT: v_pack_b32_f16 v55, v1, v2 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v4 + ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[54:55], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v6 + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v81 + ; GCN-NEXT: v_fma_f32 v1, s4, v8, -v63 + ; GCN-NEXT: v_exp_f32_e32 v86, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v82 + ; GCN-NEXT: v_fma_f32 v3, s4, v9, -v63 + ; GCN-NEXT: v_exp_f32_e32 v87, v4 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v84 + ; GCN-NEXT: v_fma_f32 v7, s4, v10, -v63 + ; GCN-NEXT: v_exp_f32_e32 v88, v5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v85 + ; GCN-NEXT: v_fma_f32 v8, s4, v11, -v63 + ; GCN-NEXT: v_exp_f32_e32 v89, v6 + ; GCN-NEXT: v_pack_b32_f16 v56, v0, v2 + ; GCN-NEXT: v_pack_b32_f16 v57, v4, v5 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 + ; GCN-NEXT: 
v_mul_f32_e32 v3, 0x3fb8aa3b, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[28:29], v[56:57], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v86 + ; GCN-NEXT: v_fma_f32 v5, s4, v12, -v63 + ; GCN-NEXT: v_exp_f32_e32 v90, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v87 + ; GCN-NEXT: v_fma_f32 v9, s4, v13, -v63 + ; GCN-NEXT: v_exp_f32_e32 v91, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v88 + ; GCN-NEXT: v_fma_f32 v11, s4, v14, -v63 + ; GCN-NEXT: v_exp_f32_e32 v92, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v89 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_fma_f32 v12, s4, v15, -v63 + ; GCN-NEXT: ds_read_b128 v[20:23], v66 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 - ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27 - ; GCN-NEXT: v_exp_f32_e32 v26, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 - ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 - ; GCN-NEXT: v_exp_f32_e32 v67, v67 - ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 - ; GCN-NEXT: v_exp_f32_e32 v6, v6 - ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 - ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 - ; GCN-NEXT: v_exp_f32_e32 v7, v7 - ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 - ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 - ; GCN-NEXT: ; implicit-def: $sgpr2 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 - ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 - ; GCN-NEXT: 
v_mul_f32_e32 v0, 0x3fb8aa3b, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 - ; GCN-NEXT: v_exp_f32_e32 v10, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 - ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 - ; GCN-NEXT: s_nop 1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] - ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 - ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 - ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 - ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 - ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 - ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 - ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 - ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 - ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 - ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 - ; GCN-NEXT: v_exp_f32_e32 v8, v8 - ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 - ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 - ; GCN-NEXT: v_exp_f32_e32 v9, v9 - ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 - ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 - ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 - ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 - ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 - ; GCN-NEXT: v_exp_f32_e32 v11, v11 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 - ; GCN-NEXT: v_exp_f32_e32 v12, v3 - ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 - ; GCN-NEXT: v_exp_f32_e32 v17, v1 - ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 - ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 - ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 - ; GCN-NEXT: v_exp_f32_e32 v15, v3 - ; GCN-NEXT: v_exp_f32_e32 v18, v1 - ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 - ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; 
GCN-NEXT: ds_read_b128 v[0:3], v67 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 - ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 - ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 - ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 - ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 - ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 - ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 - ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] - ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 - ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 - ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 - ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 - ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 - ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 - ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 - ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 - ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: v_exp_f32_e32 v63, v8 + ; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v5 + ; GCN-NEXT: v_pack_b32_f16 v28, v4, v7 + ; GCN-NEXT: v_pack_b32_f16 v29, v10, v6 + ; GCN-NEXT: v_exp_f32_e32 v66, v5 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v9 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[30:31], v[28:29], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v11 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v90 + ; GCN-NEXT: v_exp_f32_e32 v93, v8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v91 + ; GCN-NEXT: v_exp_f32_e32 v94, v9 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v92 + ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v63 + ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_exp_f32_e32 v95, v11 + ; GCN-NEXT: v_pack_b32_f16 v30, v4, v5 + ; GCN-NEXT: v_pack_b32_f16 v31, v6, v7 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v66 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v95 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[0:1], v[30:31], v[32:47] + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v94 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v93 + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_add_u32_e32 v52, s0, v52 + ; GCN-NEXT: v_pack_b32_f16 v59, v0, v4 + ; 
GCN-NEXT: v_pack_b32_f16 v58, v5, v1 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[2:3], v[58:59], v[32:47] + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: v_mul_f32_e64 v0, v0, v62 + ; GCN-NEXT: v_mul_f32_e64 v1, v1, v62 + ; GCN-NEXT: v_mul_f32_e64 v2, v2, v62 + ; GCN-NEXT: v_mul_f32_e64 v3, v3, v62 + ; GCN-NEXT: v_mul_f32_e64 v4, v4, v62 + ; GCN-NEXT: v_mul_f32_e64 v5, v5, v62 + ; GCN-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[62:63] op_sel_hi:[1,0] + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[64:65], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v16, v68, v76 + ; GCN-NEXT: v_add_f32_e32 v16, v69, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v70, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v71, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v72, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v73, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v75, v16 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[48:49], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[24:25], v[50:51], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v24, v77, v16 + ; GCN-NEXT: ds_read_b128 v[16:19], v67 offset:576 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_inv sc0 sc1 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: s_waitcnt vmcnt(8) ; GCN-NEXT: ;;#ASMEND - ; GCN-NEXT: v_mov_b32_e32 v4, 0 - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] - ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] - ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[0:15], v[26:27], v[54:55], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[20:21], v[56:57], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[22:23], v[28:29], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[30:31], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[18:19], v[58:59], v[0:15] + ; GCN-NEXT: v_add_f32_e32 v20, v80, v24 + ; GCN-NEXT: v_add_f32_e32 v20, v83, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v74, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v60, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v61, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v78, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v79, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v81, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v82, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v84, v20 + ; GCN-NEXT: v_add_f32_e32 v20, v85, v20 + ; GCN-NEXT: v_add_f32_e32 v16, v86, v20 + ; GCN-NEXT: v_add_f32_e32 v16, v87, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v88, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v89, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v90, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v91, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v92, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v63, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v66, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v93, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v94, v16 + ; GCN-NEXT: v_add_f32_e32 v16, v95, v16 + ; GCN-NEXT: ds_bpermute_b32 v17, v53, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 - ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: v_add_f32_e32 v0, v16, v17 + ; GCN-NEXT: ds_bpermute_b32 v0, v53, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) - ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] - ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_endpgm attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} diff --git a/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir new file mode 100644 index 0000000000000..8721c32688571 --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/misched-ds-mfma-order-false-deps.mir @@ -0,0 +1,118 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs -amdgpu-disable-mfma-chain-order-deps %s -o - 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=machine-scheduler -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=CHAIN + +--- +name: test_fmha_order +tracksRegLiveness: true +body: | + bb.0: + liveins: $exec + ; CHECK-LABEL: name: test_fmha_order + ; CHECK: liveins: $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %addr:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF + ; CHECK-NEXT: %accB:vreg_512_align2 = IMPLICIT_DEF + ; CHECK-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF + ; CHECK-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF + ; CHECK-NEXT: SCHED_BARRIER 0, implicit $exec + ; CHECK-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec + ; CHECK-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec + ; CHECK-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec + ; CHECK-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec + ; CHECK-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec + ; CHECK-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec + ; CHECK-NEXT: %t3a:vreg_128_align2 = 
DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec + ; CHECK-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec + ; CHECK-NEXT: SCHED_BARRIER 1, implicit $exec + ; CHECK-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit $exec + ; + ; CHAIN-LABEL: name: test_fmha_order + ; CHAIN: liveins: $exec + ; CHAIN-NEXT: {{ $}} + ; CHAIN-NEXT: %addr:vgpr_32 = IMPLICIT_DEF + ; CHAIN-NEXT: %c0:vreg_128_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: %c1:vreg_128_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: %accA:vreg_512_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: 
%accB:vreg_512_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: %accC:vreg_512_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: %accD:vreg_512_align2 = IMPLICIT_DEF + ; CHAIN-NEXT: SCHED_BARRIER 0, implicit $exec + ; CHAIN-NEXT: %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec + ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec + ; CHAIN-NEXT: %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: dead %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec + ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec + ; CHAIN-NEXT: dead %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec + ; CHAIN-NEXT: %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: dead %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, 
implicit $exec + ; CHAIN-NEXT: dead %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec + ; CHAIN-NEXT: %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: dead %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec + ; CHAIN-NEXT: SCHED_BARRIER 1, implicit $exec + ; CHAIN-NEXT: dead %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec + ; CHAIN-NEXT: S_ENDPGM 0, implicit $exec + %addr:vgpr_32 = IMPLICIT_DEF + %c0:vreg_128_align2 = IMPLICIT_DEF + %c1:vreg_128_align2 = IMPLICIT_DEF + %accA:vreg_512_align2 = IMPLICIT_DEF + %accB:vreg_512_align2 = IMPLICIT_DEF + %accC:vreg_512_align2 = IMPLICIT_DEF + %accD:vreg_512_align2 = IMPLICIT_DEF + + SCHED_BARRIER 0, implicit $exec + + %t0a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9216, 0, implicit $exec + %t0b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 9248, 0, implicit $exec + %t1a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11392, 0, implicit $exec + %t1b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 11424, 0, implicit $exec + %t2a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13568, 0, implicit $exec + %t2b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 13600, 0, implicit $exec + %t3a:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15744, 0, implicit $exec + %t3b:vreg_128_align2 = DS_READ_B128_gfx9 %addr, 15776, 0, implicit $exec + + %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub0_sub1, %c0.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0a.sub2_sub3, %c0.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + + %accB:vreg_512_align2 = 
V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub0_sub1, %c0.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1a.sub2_sub3, %c0.sub2_sub3, %accB, 0, 0, 0, implicit $mode, implicit $exec + + %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub0_sub1, %c0.sub0_sub1, %accC, 0, 0, 0, implicit $mode, implicit $exec + %accC:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t2a.sub2_sub3, %c0.sub2_sub3, %accC, 0, 0, 0, implicit $mode, implicit $exec + + %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub0_sub1, %c0.sub0_sub1, %accD, 0, 0, 0, implicit $mode, implicit $exec + %accD:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t3a.sub2_sub3, %c0.sub2_sub3, %accD, 0, 0, 0, implicit $mode, implicit $exec + + %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub0_sub1, %c1.sub0_sub1, %accA, 0, 0, 0, implicit $mode, implicit $exec + %accA:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t0b.sub2_sub3, %c1.sub2_sub3, %accA, 0, 0, 0, implicit $mode, implicit $exec + + %accB:vreg_512_align2 = V_MFMA_F32_32X32X8BF16_1K_mac_vgprcd_e64 %t1b.sub0_sub1, %c1.sub0_sub1, %accB, 0, 0, 0, implicit $mode, implicit $exec + SCHED_BARRIER 1, implicit $exec + S_ENDPGM 0, implicit $exec + +... + +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHAIN: {{.*}} +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll index e29be2b744874..77dc5a3acbd2f 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll @@ -369,74 +369,71 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: s_mov_b32 s5, s4 -; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[4:5] +; CHECK-NEXT: v_mov_b64_e32 v[6:7], s[4:5] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[16:19] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_mov_b64_e32 v[4:5], 0 ; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; CHECK-NEXT: s_mov_b32 s0, 0x3c003c00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3] -; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3] +; CHECK-NEXT: v_mov_b64_e32 v[20:21], s[0:1] ; CHECK-NEXT: s_mov_b32 s0, 0x7e007e00 ; CHECK-NEXT: s_mov_b32 s1, s0 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7] -; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[0:1] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9] -; CHECK-NEXT: s_nop 3 -; CHECK-NEXT: v_cvt_f16_f32_e32 v24, v4 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3] -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_mov_b32_e32 v8, 0x7fc00000 -; CHECK-NEXT: v_mov_b32_e32 v9, v8 -; CHECK-NEXT: v_mov_b32_e32 v10, v8 -; CHECK-NEXT: v_mov_b32_e32 v11, v8 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v6 -; CHECK-NEXT: v_mov_b64_e32 v[0:1], 0 -; CHECK-NEXT: 
v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11] -; CHECK-NEXT: global_store_short v[0:1], v2, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11] +; CHECK-NEXT: v_mov_b64_e32 v[22:23], s[0:1] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[20:21], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[22:23], v[0:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def v[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_nop 5 +; CHECK-NEXT: v_mov_b32_e32 v12, 0x7fc00000 +; CHECK-NEXT: v_mov_b32_e32 v13, v12 +; CHECK-NEXT: v_mov_b32_e32 v14, v12 +; CHECK-NEXT: v_mov_b32_e32 v15, v12 +; CHECK-NEXT: v_cvt_f16_f32_e32 v9, v10 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[6:7], v[6:7], v[16:19] +; CHECK-NEXT: global_store_short v[4:5], v9, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[12:15] ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11] -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19] -; CHECK-NEXT: s_nop 5 -; CHECK-NEXT: v_cvt_f16_f32_e32 v10, v6 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15] -; CHECK-NEXT: global_store_short v[0:1], v10, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[10:13], v[6:7], v[6:7], v[10:13] +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v14, v16 +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: v_cvt_f16_f32_e32 v12, v8 +; CHECK-NEXT: v_cvt_f16_f32_e32 v13, v10 +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[0:3] +; CHECK-NEXT: global_store_short v[4:5], v13, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[20:21], v[0:3] ; CHECK-NEXT: 
buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: s_nop 1 -; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CHECK-NEXT: global_store_short v[0:1], v6, off -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23] +; CHECK-NEXT: global_store_short v[4:5], v14, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: global_store_short v[0:1], v24, off +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[6:7], v[6:7], v[8:11] +; CHECK-NEXT: global_store_short v[4:5], v12, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[6:7], v[0:3] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[8:11], v[22:23], v[6:7], v[8:11] +; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[20:21], v[6:7], v[0:3] ; CHECK-NEXT: s_nop 6 -; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v2 -; CHECK-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19] -; CHECK-NEXT: global_store_short v[0:1], v6, off +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v8 +; CHECK-NEXT: global_store_short v[4:5], v0, off ; CHECK-NEXT: buffer_wbl2 sc0 sc1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: buffer_inv sc0 sc1 -; CHECK-NEXT: s_nop 2 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: global_store_short v[0:1], v2, off +; CHECK-NEXT: global_store_short v[4:5], v1, off ; CHECK-NEXT: s_endpgm entry: %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()