diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp index 90cb044285bae..f9d4a5ba86f7a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaveTransform.cpp @@ -1909,7 +1909,7 @@ void ControlFlowRewriter::rewrite() { Opcode = AMDGPU::S_CBRANCH_SCC1; } else { Register CondReg = Info.OrigCondition; - bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(CondReg, *Node->Block); + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(CondReg, *Node->Block, Node->Block->end()); LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); if (!isCondRegSubsetOfExec) { @@ -1951,7 +1951,6 @@ void ControlFlowRewriter::rewrite() { RegMap; GCNLaneMaskUpdater Updater(Function); Updater.setLaneMaskAnalysis(&LMA); - Updater.setAccumulating(true); for (WaveNode *LaneTarget : NodeOrder) { LLVM_DEBUG(dbgs() << "\nPROCESSING NODE:" << LaneTarget->printableName() << "\n\n"); @@ -1973,7 +1972,7 @@ void ControlFlowRewriter::rewrite() { // Step 2.1: Add conditions branching to LaneTarget to the Lane mask // Updater. 
// FIXME: we are creating a register here only to initialize the updater - Updater.init(LMU.createLaneMaskReg()); + Updater.init(); Updater.addReset(*LaneTarget->Block, GCNLaneMaskUpdater::ResetInMiddle); LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << LaneTarget->printableName() << '\n'); for (const auto &NodeDivergentPair : LaneTargetInfo.OriginBranch) { @@ -2023,7 +2022,7 @@ void ControlFlowRewriter::rewrite() { } } else { CondReg = LaneOrigin.CondReg; - bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block); + bool isCondRegSubsetOfExec = LMA.isSubsetOfExec(LaneOrigin.CondReg, *LaneOrigin.Node->Block, LaneOrigin.Node->Block->getFirstTerminator()); LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(LaneOrigin.CondReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << LaneOrigin.Node->Block->name() << ") : " << isCondRegSubsetOfExec << "\n"); if (!isCondRegSubsetOfExec) { Register Prev = CondReg; @@ -2120,7 +2119,7 @@ void ControlFlowRewriter::rewrite() { LLVM_DEBUG(dbgs() << "\nRejoin @ " << Secondary->printableName() << '\n'); Secondary->dump(); // FIXME: we are creating a register here only to initialize the updater - Updater.init(LMU.createLaneMaskReg()); + Updater.init(); Updater.addReset(*Secondary->Block, GCNLaneMaskUpdater::ResetInMiddle); LLVM_DEBUG(dbgs() << "\nMark ResetInMiddle(X): " << Secondary->printableName() << '\n'); @@ -2132,32 +2131,32 @@ void ControlFlowRewriter::rewrite() { Register PrimaryExec = PredInfo.PrimarySuccessorExec; LLVM_DEBUG(dbgs() << "Pred:" << Pred->Block->name() << "\nPrimaryExec:" << printReg(PrimaryExec,MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); - MachineInstr *PrimaryExecDef; - for (;;) { - PrimaryExecDef = MRI.getVRegDef(PrimaryExec); - if (PrimaryExecDef->getOpcode() != AMDGPU::COPY) - break; - PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); - } + // MachineInstr *PrimaryExecDef; + // for (;;) { + // PrimaryExecDef = MRI.getVRegDef(PrimaryExec); + // if 
(PrimaryExecDef->getOpcode() != AMDGPU::COPY) + // break; + // PrimaryExec = PrimaryExecDef->getOperand(1).getReg(); + // } - LLVM_DEBUG(dbgs() << "PrimaryExecDef:"); - LLVM_DEBUG(PrimaryExecDef->dump()); - LLVM_DEBUG(dbgs() << "\n"); + // LLVM_DEBUG(dbgs() << "PrimaryExecDef:"); + // LLVM_DEBUG(PrimaryExecDef->dump()); + // LLVM_DEBUG(dbgs() << "\n"); // Rejoin = EXEC ^ PrimaryExec // // Fold immediately if PrimaryExec was obtained via XOR as well. Register Rejoin; - if (PrimaryExecDef->getParent() == Pred->Block && - PrimaryExecDef->getOpcode() == LMC.XorOpc && - PrimaryExecDef->getOperand(1).isReg() && - PrimaryExecDef->getOperand(2).isReg()) { - if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(2).getReg(); - else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) - Rejoin = PrimaryExecDef->getOperand(1).getReg(); - } + // if (PrimaryExecDef->getParent() == Pred->Block && + // PrimaryExecDef->getOpcode() == LMC.XorOpc && + // PrimaryExecDef->getOperand(1).isReg() && + // PrimaryExecDef->getOperand(2).isReg()) { + // if (PrimaryExecDef->getOperand(1).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(2).getReg(); + // else if (PrimaryExecDef->getOperand(2).getReg() == LMC.ExecReg) + // Rejoin = PrimaryExecDef->getOperand(1).getReg(); + // } if (!Rejoin) { // Try to find a previously generated XOR (or merely masked) value @@ -2199,6 +2198,16 @@ void ControlFlowRewriter::rewrite() { } + Updater.insertAccumulatorResets(); + // Replace all MovTermOpc operations with equivalent MovOpc operations. 
+ + for (MachineBasicBlock &MBB : Function) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == LMC.MovTermOpc) { + MI.setDesc(TII.get(LMC.MovOpc)); + } + } + } Updater.cleanup(); LLVM_DEBUG(dbgs() << "CFG_BEGIN:" << Function.getName().str() << "_clean\n"); diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp index 8bc1e7a552d4c..22cbf8950d645 100644 --- a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.cpp @@ -10,10 +10,12 @@ #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" + using namespace llvm; /// Check whether the register could be a lane-mask register. @@ -31,47 +33,70 @@ bool GCNLaneMaskUtils::maybeLaneMask(Register Reg) const { /// Determine whether the lane-mask register \p Reg is a wave-wide constant. /// If so, the value is stored in \p Val. -bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val) const { +bool GCNLaneMaskUtils::isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { MachineRegisterInfo &MRI = MF.getRegInfo(); + LLVM_DEBUG(dbgs() << "isConstantLaneMask(" << printReg(Reg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : \n"); + LLVM_DEBUG(dbgs() << "MI:"); + if(MI != MBB.end()) MI->dump(); + else LLVM_DEBUG(dbgs() << "end of block"); + LLVM_DEBUG(dbgs() << "\n"); - const MachineInstr *MI; for (;;) { - MI = MRI.getVRegDef(Reg); - if (!MI) { + MI = SIRegisterInfo::getDomVRegDefInBasicBlock(Reg, MBB, MI, + MRI.getTargetRegisterInfo()); + if (MI == MBB.end()) { // This can happen when called from GCNLaneMaskUpdater, where Reg can // be a placeholder that has not yet been filled in. 
+ LLVM_DEBUG(dbgs() << "MI == MBB.end(), return false\n"); return false; } - if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF) + LLVM_DEBUG(dbgs() << "MI:"); + MI->dump(); + LLVM_DEBUG(dbgs() << "\n"); + + if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF){ + LLVM_DEBUG(dbgs() << "MI->getOpcode() == AMDGPU::IMPLICIT_DEF, return true;\n"); return true; + } if (MI->getOpcode() != AMDGPU::COPY) break; Reg = MI->getOperand(1).getReg(); - if (!Register::isVirtualRegister(Reg)) - return false; - if (!maybeLaneMask(Reg)) - return false; + if (!Register::isVirtualRegister(Reg)){ + LLVM_DEBUG(dbgs() << "!Register::isVirtualRegister(Reg), return false\n"); + return false;} + if (!maybeLaneMask(Reg)){ + LLVM_DEBUG(dbgs() << "!maybeLaneMask(Reg), return false\n"); + return false;} } - if (MI->getOpcode() != LMC.MovOpc) - return false; + LLVM_DEBUG(dbgs() << "MI after loop:"); + MI->dump(); + LLVM_DEBUG(dbgs() << "\n"); + + if (MI->getOpcode() != LMC.MovOpc){ + LLVM_DEBUG(dbgs() << "MI->getOpcode() != LMC.MovOpc, return false\n"); + return false;} - if (!MI->getOperand(1).isImm()) - return false; + if (!MI->getOperand(1).isImm()){ + LLVM_DEBUG(dbgs() << "!MI->getOperand(1).isImm(), return false\n"); + return false;} int64_t Imm = MI->getOperand(1).getImm(); if (Imm == 0) { + LLVM_DEBUG(dbgs() << "Imm == 0, Val = false, return true\n"); Val = false; return true; } if (Imm == -1) { + LLVM_DEBUG(dbgs() << "Imm == -1, Val = true, return true\n"); Val = true; return true; } + LLVM_DEBUG(dbgs() << "End of isConstantLaneMask, return false\n"); return false; } @@ -86,8 +111,7 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC) /// /// before \p I in basic block \p MBB. Some simplifications are applied on the -/// fly based on constant inputs and analysis via \p LMA, and further -/// simplifications can be requested in "accumulating" mode. 
+/// fly based on constant inputs and analysis via \p LMA /// /// \param DstReg The virtual register into which the merged mask is written. /// \param PrevReg The virtual register with the "previous" lane mask value; @@ -96,49 +120,49 @@ Register GCNLaneMaskUtils::createLaneMaskReg() const { /// be merged into "previous". /// \param LMA If non-null, used to test whether CurReg may already be a subset /// of EXEC. -/// \param accumulating Indicates that we should assume PrevReg is already -/// properly masked, i.e. use PrevReg directly instead of -/// (PrevReg & ~EXEC), and don't add extra 1-bits to DstReg -/// beyond (CurReg & EXEC). void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA, - bool accumulating) const { + bool isPrevZeroReg) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool PrevVal = false; - bool PrevConstant = !PrevReg || isConstantLaneMask(PrevReg, PrevVal); + bool PrevConstant = !PrevReg || isPrevZeroReg; bool CurVal = false; - bool CurConstant = isConstantLaneMask(CurReg, CurVal); + bool CurConstant = isConstantLaneMask(CurReg, CurVal, MBB, I); MachineRegisterInfo &MRI = MF.getRegInfo(); Printable destRegPrintable = printReg(DstReg , MRI.getTargetRegisterInfo(), 0, &MRI); Printable curRegPrintable = printReg(CurReg , MRI.getTargetRegisterInfo(), 0, &MRI); Printable prevRegPrintable = printReg(PrevReg , MRI.getTargetRegisterInfo(), 0, &MRI); - dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks(" << MBB.name() << ",...):\n"; - dbgs() << "\t\t DstReg : BlockInfo.Merged : " << destRegPrintable << "\n"; - dbgs() << "\t\t PrevReg : Previous : " << prevRegPrintable << "\n"; - dbgs() << "\t\t CurReg : BlockInfo.Value : " << curRegPrintable << "\n"; - dbgs() << "\t\t Create instr : " << destRegPrintable << " = (" << prevRegPrintable << " & ~EXEC) | (" << 
curRegPrintable << " & EXEC) : \n"; - dbgs() << "\t\tPrevConstant:" << PrevConstant << " CurConstant:" << CurConstant << "\n"; - dbgs() << "\t\tPrevVal:" << PrevVal << " CurVal:" << CurVal << "\n"; - - assert(PrevReg || !accumulating); + LLVM_DEBUG(dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks(" << MBB.name() << ",...):\n"); + LLVM_DEBUG(dbgs() << "\t\t DstReg : BlockInfo.Merged : " << destRegPrintable << "\n"); + LLVM_DEBUG(dbgs() << "\t\t PrevReg : Previous : " << prevRegPrintable << "\n"); + LLVM_DEBUG(dbgs() << "\t\t CurReg : BlockInfo.Value : " << curRegPrintable << "\n"); + LLVM_DEBUG(dbgs() << "\t\t Create instr : " << destRegPrintable << " = (" << prevRegPrintable << " & ~EXEC) | (" << curRegPrintable << " & EXEC) : \n"); + LLVM_DEBUG(dbgs() << "\t\tPrevConstant:" << PrevConstant << " CurConstant:" << CurConstant << "\n"); + LLVM_DEBUG(dbgs() << "\t\tPrevVal:" << PrevVal << " CurVal:" << CurVal << "\n"); + LLVM_DEBUG(dbgs() << "\t\tIterator I:"); + if(I != MBB.end()) I->dump(); + else LLVM_DEBUG(dbgs() << "end of block"); + LLVM_DEBUG(dbgs() << "\n"); + + assert(PrevReg); if (PrevConstant && CurConstant) {// is wave wide constant? if (PrevVal == CurVal) { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg)->dump(); } else if (CurVal) { // If PrevReg is undef, prefer to propagate a full constant. - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) .addReg(PrevReg ? 
LMC.ExecReg : CurReg)->dump(); } else { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(LMC.XorOpc), DstReg) .addReg(LMC.ExecReg) .addImm(-1)->dump(); @@ -151,26 +175,16 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, Register PrevMaskedReg; Register CurMaskedReg; if (!PrevConstant) { - if (accumulating || (CurConstant && CurVal)) { - PrevMaskedReg = PrevReg; - } else { - PrevMaskedReg = createLaneMaskReg(); - dbgs() << "\t "; - PrevMaskedBuilt = - BuildMI(MBB, I, DL, TII->get(LMC.AndN2Opc), PrevMaskedReg) - .addReg(PrevReg) - .addReg(LMC.ExecReg); - PrevMaskedBuilt->dump(); - } + PrevMaskedReg = PrevReg; } if (!CurConstant) { - bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB); - dbgs() << "isSubsetOfExec(" << printReg(CurReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : " << isCurRegSubsetOfExec << "\n"; + bool isCurRegSubsetOfExec = LMA && LMA->isSubsetOfExec(CurReg, MBB, I); + LLVM_DEBUG(dbgs() << "isSubsetOfExec(" << printReg(CurReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "," << MBB.name() << ") : " << isCurRegSubsetOfExec << "\n"); if ((PrevConstant && PrevVal) || isCurRegSubsetOfExec) { CurMaskedReg = CurReg; } else { CurMaskedReg = createLaneMaskReg(); - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); CurMaskedBuilt = BuildMI(MBB, I, DL, TII->get(LMC.AndOpc), CurMaskedReg) .addReg(CurReg) .addReg(LMC.ExecReg); @@ -184,33 +198,33 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, if (PrevConstant && !PrevVal) { if (CurMaskedBuilt) { CurMaskedBuilt->getOperand(0).setReg(DstReg); - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); CurMaskedBuilt->dump(); } else { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurMaskedReg)->dump(); } } else if (CurConstant && !CurVal) { if (PrevMaskedBuilt) { PrevMaskedBuilt->getOperand(0).setReg(DstReg); - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); 
PrevMaskedBuilt->dump(); } else { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(PrevMaskedReg)->dump(); } } else if (PrevConstant && PrevVal) { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(LMC.OrN2Opc), DstReg) .addReg(CurMaskedReg) .addReg(LMC.ExecReg)->dump(); } else { - dbgs() << "\t "; + LLVM_DEBUG(dbgs() << "\t "); BuildMI(MBB, I, DL, TII->get(LMC.OrOpc), DstReg) .addReg(PrevMaskedReg) .addReg(CurMaskedReg ? CurMaskedReg : LMC.ExecReg)->dump(); } - dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks() ends\n"; + LLVM_DEBUG(dbgs() << "\t\tGCNLaneMaskUtils::buildMergeLaneMasks() ends\n"); } /// Conservatively determine whether the \p Reg is a subset of EXEC for @@ -218,99 +232,130 @@ void GCNLaneMaskUtils::buildMergeLaneMasks(MachineBasicBlock &MBB, /// (Reg & EXEC) == Reg when used in \p UseBlock. bool GCNLaneMaskAnalysis::isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth) { MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); - MachineInstr *DefInstr = nullptr; + MachineBasicBlock::iterator DefInstr = UseBlock.end(); const AMDGPU::LaneMaskConstants &LMC = LMU.getLaneMaskConsts(); for (;;) { if (!Register::isVirtualRegister(Reg)) { if (Reg == LMC.ExecReg && - (!DefInstr || DefInstr->getParent() == &UseBlock)) - return true; + (DefInstr == UseBlock.end() || DefInstr->getParent() == &UseBlock)){ + LLVM_DEBUG(dbgs() << "Reg is EXEC in same BB, return true\n"); + return true;} + LLVM_DEBUG(dbgs() << "Reg is not EXEC or is in other BB, return false\n"); return false; } - DefInstr = MRI.getVRegDef(Reg); + DefInstr = SIRegisterInfo::getDomVRegDefInBasicBlock( + Reg, UseBlock, I, MRI.getTargetRegisterInfo()); + if (DefInstr == UseBlock.end()) { + LLVM_DEBUG(dbgs() << "DefInstr == UseBlock.end(), return false\n"); + return false;} if (DefInstr->getOpcode() == AMDGPU::COPY) { Reg = 
DefInstr->getOperand(1).getReg(); + I = DefInstr; continue; } if (DefInstr->getOpcode() == LMC.MovOpc) { if (DefInstr->getOperand(1).isImm() && - DefInstr->getOperand(1).getImm() == 0) - return true; + DefInstr->getOperand(1).getImm() == 0){ + LLVM_DEBUG(dbgs() << "MOV 0, return true\n"); + return true;} + LLVM_DEBUG(dbgs() << "MOV is not imm or not 0, return false\n"); return false; } break; } - if (DefInstr->getParent() != &UseBlock) - return false; + LLVM_DEBUG(dbgs() << "DefInstr:"); + DefInstr->dump(); + LLVM_DEBUG(dbgs() << "\n"); + + if (DefInstr->getParent() != &UseBlock){ + LLVM_DEBUG(dbgs() << "DefInstr->getParent() != &UseBlock, return false\n"); + return false;} auto CacheIt = SubsetOfExec.find(Reg); - if (CacheIt != SubsetOfExec.end()) + if (CacheIt != SubsetOfExec.end()){ + LLVM_DEBUG(dbgs() << "CacheIt != SubsetOfExec.end(), return CacheIt->second: " << CacheIt->second << " \n"); return CacheIt->second; + } // V_CMP_xx always return a subset of EXEC. if (DefInstr->isCompare() && (SIInstrInfo::isVOPC(*DefInstr) || SIInstrInfo::isVOP3(*DefInstr))) { SubsetOfExec[Reg] = true; + LLVM_DEBUG(dbgs() << "DefInstr is VOPC or VOP3, return true\n"); return true; } if (!RemainingDepth--) - return false; + {LLVM_DEBUG(dbgs() << "RemainingDepth-- is 0, return false\n"); + return false;} bool LikeOr = DefInstr->getOpcode() == LMC.OrOpc || DefInstr->getOpcode() == LMC.XorOpc || DefInstr->getOpcode() == LMC.CSelectOpc; bool IsAnd = DefInstr->getOpcode() == LMC.AndOpc; bool IsAndN2 = DefInstr->getOpcode() == LMC.AndN2Opc; + LLVM_DEBUG(dbgs() << "LikeOr: " << LikeOr << " IsAnd: " << IsAnd << " IsAndN2: " << IsAndN2 << "\n"); if ((LikeOr || IsAnd || IsAndN2) && (DefInstr->getOperand(1).isReg() && DefInstr->getOperand(2).isReg())) { bool FirstIsSubset = isSubsetOfExec(DefInstr->getOperand(1).getReg(), - UseBlock, RemainingDepth); - if (!FirstIsSubset && (LikeOr || IsAndN2)) - return SubsetOfExec.try_emplace(Reg, false).first->second; + UseBlock, DefInstr, 
RemainingDepth); + + LLVM_DEBUG(dbgs() << "FirstIsSubset: " << FirstIsSubset << "\n"); + + if (!FirstIsSubset && (LikeOr || IsAndN2)){ + bool res = SubsetOfExec.try_emplace(Reg, false).first->second; + LLVM_DEBUG(dbgs() << "FirstIsSubset is false and (LikeOr || IsAndN2), return res: " << res << "\n"); + return res;} if (FirstIsSubset && (IsAnd || IsAndN2)) { SubsetOfExec[Reg] = true; + LLVM_DEBUG(dbgs() << "FirstIsSubset is true and (IsAnd || IsAndN2), return true\n"); return true; } bool SecondIsSubset = isSubsetOfExec(DefInstr->getOperand(2).getReg(), - UseBlock, RemainingDepth); - if (!SecondIsSubset) - return SubsetOfExec.try_emplace(Reg, false).first->second; + UseBlock, DefInstr, RemainingDepth); + LLVM_DEBUG(dbgs() << "SecondIsSubset: " << SecondIsSubset << "\n"); + if (!SecondIsSubset){ + bool res = SubsetOfExec.try_emplace(Reg, false).first->second; + LLVM_DEBUG(dbgs() << "SecondIsSubset is false, return res: " << res << "\n"); + return res;} SubsetOfExec[Reg] = true; + LLVM_DEBUG(dbgs() << "SecondIsSubset is true, return true\n"); return true; } + LLVM_DEBUG(dbgs() << "Enod of function ,return false\n"); return false; } /// Initialize the updater. -void GCNLaneMaskUpdater::init(Register Reg) { +void GCNLaneMaskUpdater::init() { Processed = false; Blocks.clear(); // SSAUpdater.Initialize(LMU.getLaneMaskConsts().LaneMaskRC); - SSAUpdater.Initialize(Reg); + Accumulator = AMDGPU::NoRegister; } /// Optional cleanup, may remove stray instructions. 
void GCNLaneMaskUpdater::cleanup() { Processed = false; Blocks.clear(); - + Accumulator = AMDGPU::NoRegister; MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); if (ZeroReg && MRI.use_empty(ZeroReg)) { MRI.getVRegDef(ZeroReg)->eraseFromParent(); - ZeroReg = {}; + ZeroReg = AMDGPU::NoRegister; } for (MachineInstr *MI : PotentiallyDead) { @@ -351,7 +396,7 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block, BlockIt = Blocks.end() - 1; } assert(!BlockIt->Value); - dbgs() << "GCNLaneMaskUpdater::addAvailable(" << Block.name() << "," << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::addAvailable(" << Block.name() << "," << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"); BlockIt->Value = Value; } @@ -359,53 +404,53 @@ void GCNLaneMaskUpdater::addAvailable(MachineBasicBlock &Block, /// Return the value in the middle of the block, i.e. before any change that /// was registered via \ref addAvailable. Register GCNLaneMaskUpdater::getValueInMiddleOfBlock(MachineBasicBlock &Block) { - dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << ")\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << ")\n"); if (!Processed) process(); - Register reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); - dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + Register reg = Accumulator; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueInMiddleOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"); return reg; } /// Return the value at the end of the given block, i.e. after any change that /// was registered via \ref addAvailable. 
/// -/// Note: If \p Block is the reset block in accumulating mode with ResetAtEnd +/// Note: If \p Block is the reset block with ResetAtEnd /// reset mode, then this value will be 0. You likely want /// \ref getPreReset instead. Register GCNLaneMaskUpdater::getValueAtEndOfBlock(MachineBasicBlock &Block) { - dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << ")\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << ")\n"); if (!Processed) process(); - Register reg = SSAUpdater.GetValueAtEndOfBlock(&Block); - dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + Register reg = Accumulator; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAtEndOfBlock(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"); return reg; } /// Return the value in \p Block after the value merge (if any). Register GCNLaneMaskUpdater::getValueAfterMerge(MachineBasicBlock &Block) { - dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << ")\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << ")\n"); if (!Processed) process(); - Register reg = {}; + Register reg = AMDGPU::NoRegister; auto BlockIt = findBlockInfo(Block); if (BlockIt != Blocks.end()) { - if (BlockIt->Merged){ - reg = BlockIt->Merged; - dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning Merged.\n"; + if (BlockIt->Value){ + reg = Accumulator; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning Merged.\n"); return reg; } if (BlockIt->Flags & ResetInMiddle){ reg = ZeroReg; - dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, 
&MRI) << ") returning ZeroReg.\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ") returning ZeroReg.\n"); return reg; } } // We didn't merge anything in the block, but the block may still be // ResetAtEnd, in which case we need the pre-reset value. - reg = SSAUpdater.GetValueInMiddleOfBlock(&Block); - dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"; + reg = Accumulator; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::getValueAfterMerge(" << Block.name() << "," << printReg(reg, MRI.getTargetRegisterInfo(), 0, &MRI) << ")\n"); return reg; } @@ -453,113 +498,122 @@ getSaluInsertionAtEnd(MachineBasicBlock &MBB) { llvm_unreachable("SCC used by terminator but no def in block"); } +void GCNLaneMaskUpdater::insertAccumulatorResets() { + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::insertAccumulatorResets()\n"); + const SIInstrInfo *TII = LMU.function()->getSubtarget().getInstrInfo(); + for (auto &Entry : AccumulatorResetBlocks) { + MachineBasicBlock *B = Entry.first; + DenseSet &Accumulators = Entry.second; + for (Register ACC : Accumulators) { + //get first branch instruction + MachineBasicBlock::iterator I = B->getFirstTerminator(); + while(I != B->end() && !I->isBranch()) I++; + if(I == B->end()) I--; + LLVM_DEBUG(dbgs() << " Resetting accumulator: " << printReg(ACC, MRI.getTargetRegisterInfo(), 0, &MRI) << "@" << B->name()<< "\n"); + LLVM_DEBUG(dbgs() << " insertion point:"); + if(I == B->end()) + LLVM_DEBUG(dbgs() << " end of block"); + else + I->dump(); + LLVM_DEBUG(dbgs() << "\n"); + BuildMI(*B, I, {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ACC).addImm(0)->dump(); + } + } +} + /// Internal method to insert merge instructions. 
void GCNLaneMaskUpdater::process() { - dbgs() << "\n\tGCNLaneMaskUpdater::process() begins\n"; + LLVM_DEBUG(dbgs() << "\n\tGCNLaneMaskUpdater::process() begins\n"); MachineRegisterInfo &MRI = LMU.function()->getRegInfo(); const SIInstrInfo *TII = LMU.function()->getSubtarget().getInstrInfo(); MachineBasicBlock &Entry = LMU.function()->front(); - // Prepare an all-zero value for the default and reset in accumulating mode. - if (Accumulating && !ZeroReg) { + if (!ZeroReg) { ZeroReg = LMU.createLaneMaskReg(); BuildMI(Entry, Entry.getFirstTerminator(), {}, TII->get(LMU.getLaneMaskConsts().MovOpc), ZeroReg) .addImm(0); } - dbgs() << "\tZeroReg:" << printReg(ZeroReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; - dbgs() << "\n\tAdding available values:\n"; + LLVM_DEBUG(dbgs() << "\tZeroReg:" << printReg(ZeroReg, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); + LLVM_DEBUG(dbgs() << "\n\tAdding available values:\n"); + + if (!Accumulator) { + Accumulator = LMU.createLaneMaskReg(); + LLVM_DEBUG(dbgs() << "\tCreating Accumulator:" << printReg(Accumulator, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); + BuildMI(Entry, Entry.getFirstTerminator(), {}, + TII->get(LMU.getLaneMaskConsts().MovOpc), Accumulator) + .addImm(0); + } + LLVM_DEBUG(dbgs() << "\n\tMachineSSAUpdater ready, begin merging\n"); // Add available values. for (BlockInfo &Info : Blocks) { - dbgs() << "\tAdd avail value for BlockInfo:" << Info.Block->name() << "\n\t"; - assert(Accumulating || !Info.Flags); + LLVM_DEBUG(dbgs() << "\tAdd avail value for BlockInfo:" << Info.Block->name() << "\n\t"); assert(Info.Flags || Info.Value); - - if (Info.Value){ - Info.Merged = LMU.createLaneMaskReg(); - dbgs() << "creating Info.Merged:" << printReg(Info.Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n\t"; - } - Info.dump(MRI); - //Info.Value and not ResetAtEnd, then Info.Merged, else ZeroReg - Register val = (Info.Value && !(Info.Flags & ResetAtEnd)) ? 
Info.Merged : ZeroReg; - dbgs() << "\t\t(Info.Value && !(Info.Flags & ResetAtEnd)) : " << (Info.Value && !(Info.Flags & ResetAtEnd)) << " => "; - if((Info.Value && !(Info.Flags & ResetAtEnd))) - dbgs() << "Info.Merged\n"; - else - dbgs() << "ZeroReg\n"; - SSAUpdater.AddAvailableValue(Info.Block,val); - dbgs() << "\n"; - - } - - if (Accumulating && !SSAUpdater.HasValueForBlock(&Entry)){ - dbgs() << "\tAdd avail value for Entry block : ZeroReg\n"; - SSAUpdater.AddAvailableValue(&Entry, ZeroReg); + if(!Info.Value || (Info.Flags & ResetAtEnd)){ + LLVM_DEBUG(dbgs() << " !Info.Value || (Info.Flags & ResetAtEnd) is true\n"); + LLVM_DEBUG(dbgs() << " AccumulatorResetBlocks[" << Info.Block->name() << "]:" << printReg(Accumulator, MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); + AccumulatorResetBlocks[Info.Block].insert(Accumulator); + } } - - dbgs() << "\n\tMachineSSAUpdater ready, begin merging\n"; - - + // Once the SSA updater is ready, we can fill in all merge code, relying // on the SSA updater to insert required PHIs. for (BlockInfo &Info : Blocks) { if (!Info.Value) continue; - dbgs() << "\tmerge "; + LLVM_DEBUG(dbgs() << "\tmerge "); Info.dump(MRI); - dbgs() << "\n"; + LLVM_DEBUG(dbgs() << "\n"); // Determine the "previous" value, if any. 
Register Previous; if (Info.Block != &LMU.function()->front() && !(Info.Flags & ResetInMiddle)) { - Previous = SSAUpdater.GetValueInMiddleOfBlock(Info.Block); - if (Accumulating) { - assert(!MRI.getVRegDef(Previous) || - MRI.getVRegDef(Previous)->getOpcode() != AMDGPU::IMPLICIT_DEF); - } else { - MachineInstr *PrevInstr = MRI.getVRegDef(Previous); - if (PrevInstr && PrevInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) { - PotentiallyDead.insert(PrevInstr); - Previous = {}; - } - } + Previous = Accumulator; } else { - dbgs() << "\tEither one of the following 2 conds are true:\n"; - dbgs() << "\tInfo.Block == &LMU.function()->front():" << (Info.Block == &LMU.function()->front()) << "\n"; - dbgs() << "\tInfo.Flags & ResetInMiddle:" << (Info.Flags & ResetInMiddle) << "\n"; - if (Accumulating){ - Previous = ZeroReg; - dbgs() << "\tBlock:" << Info.Block->name() << " Previous is ZeroReg:" << printReg(Previous , MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"; - } + LLVM_DEBUG(dbgs() << "\tEither one of the following 2 conds are true:\n"); + LLVM_DEBUG(dbgs() << "\tInfo.Block == &LMU.function()->front():" << (Info.Block == &LMU.function()->front()) << "\n"); + LLVM_DEBUG(dbgs() << "\tInfo.Flags & ResetInMiddle:" << (Info.Flags & ResetInMiddle) << "\n"); + LLVM_DEBUG(dbgs() << "\tBlock:" << Info.Block->name() << " Previous is ZeroReg:" << printReg(Previous , MRI.getTargetRegisterInfo(), 0, &MRI) << "\n"); + Previous = ZeroReg; + } // Insert merge logic. 
MachineBasicBlock::iterator insertPt = getSaluInsertionAtEnd(*Info.Block); - LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Info.Merged, Previous, - Info.Value, LMA, Accumulating); - - if (Info.Flags & ResetAtEnd) { + LMU.buildMergeLaneMasks(*Info.Block, insertPt, {}, Accumulator, Previous, + Info.Value, LMA, (Previous == ZeroReg)); + + + /*if (Info.Flags & ResetAtEnd) { + // We enter this if block if Info.Block is Ti and Ri + // Here we check if Accumulator was set by a simple copy, if so, we use the corresponding register + // This is a copy propagation optimization. + // It depends on getting the latest def of Accumulator in Info.Block and checking if it has no uses. + // TODO : Switching off this optimization for nonSSA context since Accumulator will + // have a use at the end of Info.Block : Set Accumulator to 0 (since Info.Block is Ri) + // Will implement a nonSSA variant for the same. + MachineInstr *mergeInstr = MRI.getVRegDef(Info.Merged); - dbgs() << "\tmergeInstr:"; + LLVM_DEBUG(dbgs() << "\tmergeInstr:"); mergeInstr->dump(); - dbgs() << "\n"; + LLVM_DEBUG(dbgs() << "\n"); if (mergeInstr->getOpcode() == AMDGPU::COPY && mergeInstr->getOperand(1).getReg().isVirtual()) { assert(MRI.use_empty(Info.Merged)); Info.Merged = mergeInstr->getOperand(1).getReg(); - dbgs() << "\tset Merged:" << printReg(Info.Merged , MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n"; - dbgs() << "\tErase mergeInstr\n"; + LLVM_DEBUG(dbgs() << "\tset Merged:" << printReg(Info.Merged , MRI.getTargetRegisterInfo(), 0, &MRI) << " for block " << Info.Block->name() << "\n"); + LLVM_DEBUG(dbgs() << "\tErase mergeInstr\n"); mergeInstr->eraseFromParent(); } - } + }*/ } Processed = true; - dbgs() << "GCNLaneMaskUpdater::process() ends\n"; + LLVM_DEBUG(dbgs() << "GCNLaneMaskUpdater::process() ends\n"); } diff --git a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h index 2903f93fd98e1..ed46f2f6c6708 100644 ---
a/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h +++ b/llvm/lib/Target/AMDGPU/GCNLaneMaskUtils.h @@ -23,6 +23,8 @@ #include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/Support/Debug.h" +#define DEBUG_TYPE "gcn-lane-mask-utils" + namespace llvm { class GCNLaneMaskAnalysis; @@ -44,14 +46,14 @@ class GCNLaneMaskUtils { const AMDGPU::LaneMaskConstants &getLaneMaskConsts() const { return LMC; } bool maybeLaneMask(Register Reg) const; - bool isConstantLaneMask(Register Reg, bool &Val) const; + bool isConstantLaneMask(Register Reg, bool &Val, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; Register createLaneMaskReg() const; void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg, GCNLaneMaskAnalysis *LMA = nullptr, - bool Accumulating = false) const; + bool isPrevZeroReg = false) const; }; /// Lazy analyses of lane masks. @@ -64,7 +66,8 @@ class GCNLaneMaskAnalysis { public: GCNLaneMaskAnalysis(MachineFunction &MF) : LMU(MF) {} - bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + bool isSubsetOfExec(Register Reg, MachineBasicBlock &UseBlock, + MachineBasicBlock::iterator I, unsigned RemainingDepth = 5); }; @@ -106,9 +109,7 @@ class GCNLaneMaskUpdater { private: GCNLaneMaskUtils LMU; GCNLaneMaskAnalysis *LMA = nullptr; - MachineSSAUpdater SSAUpdater; MachineRegisterInfo &MRI; - bool Accumulating = false; bool Processed = false; @@ -116,19 +117,17 @@ class GCNLaneMaskUpdater { MachineBasicBlock *Block; unsigned Flags = 0; // ResetFlags Register Value; - Register Merged; explicit BlockInfo(MachineBasicBlock *Block) : Block(Block) {} void dump(MachineRegisterInfo &MRI) { - dbgs() << "BlockInfo{"; - dbgs() << " Block:" << Block->name() << ","; - dbgs() << " Value:" << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; - dbgs() << " Merged:" << printReg(Merged, MRI.getTargetRegisterInfo(), 0, &MRI) << ","; - dbgs() << " Flags:"; - if(Flags & 
ResetAtEnd) dbgs() << "ResetAtEnd,"; - if(Flags & ResetInMiddle) dbgs() << "ResetInMiddle,"; - dbgs() << "}\n"; + LLVM_DEBUG(dbgs() << "BlockInfo{"); + LLVM_DEBUG(dbgs() << " Block:" << Block->name() << ","); + LLVM_DEBUG(dbgs() << " Value:" << printReg(Value, MRI.getTargetRegisterInfo(), 0, &MRI) << ","); + LLVM_DEBUG(dbgs() << " Flags:"); + if(Flags & ResetAtEnd) LLVM_DEBUG(dbgs() << "ResetAtEnd,"); + if(Flags & ResetInMiddle) LLVM_DEBUG(dbgs() << "ResetInMiddle,"); + LLVM_DEBUG(dbgs() << "}\n"); } }; @@ -136,24 +135,24 @@ class GCNLaneMaskUpdater { Register ZeroReg; DenseSet PotentiallyDead; - + DenseMap> AccumulatorResetBlocks; public: - GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), SSAUpdater(MF), MRI(MF.getRegInfo()) {} + Register Accumulator; + + GCNLaneMaskUpdater(MachineFunction &MF) : LMU(MF), MRI(MF.getRegInfo()) {} void setLaneMaskAnalysis(GCNLaneMaskAnalysis *Analysis) { LMA = Analysis; } - void init(Register Reg); + void init(); void cleanup(); - void setAccumulating(bool Val) { Accumulating = Val; } - void addReset(MachineBasicBlock &Block, ResetFlags Flags); void addAvailable(MachineBasicBlock &Block, Register Value); Register getValueInMiddleOfBlock(MachineBasicBlock &Block); Register getValueAtEndOfBlock(MachineBasicBlock &Block); Register getValueAfterMerge(MachineBasicBlock &Block); - + void insertAccumulatorResets(); private: void process(); SmallVectorImpl::iterator findBlockInfo(MachineBasicBlock &Block); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 10af38c637a39..4f8fedf3e02cf 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4182,6 +4182,20 @@ const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const { : &AMDGPU::VReg_64RegClass; } +MachineBasicBlock::iterator SIRegisterInfo::getDomVRegDefInBasicBlock( + Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const TargetRegisterInfo *TRI) { + if 
(I == MBB.begin()) + return MBB.end(); + // Iterate backwards from I (exclusive) to the beginning of the basic block + do { + --I; + if (I->definesRegister(Reg, TRI)) + return I; + } while (I != MBB.begin()); + return MBB.end(); +} + // Find reaching register definition MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index bbb32397bc5a5..cf4a2945393ed 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -417,6 +417,15 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { MachineRegisterInfo &MRI, LiveIntervals *LIS) const; + /// getDomVRegDefInBasicBlock - Return the last machine instr that defines + /// the specified virtual register in the basic block, searching backwards + /// from instruction I (exclusive). Returns MBB.end() if no definition is + /// found. + static MachineBasicBlock::iterator + getDomVRegDefInBasicBlock(Register Reg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const TargetRegisterInfo *TRI); + const uint32_t *getAllVGPRRegMask() const; const uint32_t *getAllAGPRRegMask() const; const uint32_t *getAllVectorRegMask() const;