Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

Expand Down Expand Up @@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur(
"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));

static cl::opt<bool> DisableMfmaChainOrderingDeps(
"amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden,
cl::desc("Enable artificial false dependencies between MFMA chains"));

// Components of the mask that determines which instruction types may be may be
// classified into a SchedGroup.
enum class SchedGroupMask {
Expand Down Expand Up @@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
// Add DAG edges that enforce SCHED_BARRIER ordering.
void addSchedBarrierEdges(SUnit &SU);

// Add artificial false-dependencies between MFMA consumers of adjacent
// DS_READ_B128 streams to enforce MFMA(newer) -> MFMA(older-last) ordering.
void addMfmaFalseDeps();

// Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
// not be reordered accross the SCHED_BARRIER. This is used for the base
// SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
Expand Down Expand Up @@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
}
}

if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts())
addMfmaFalseDeps();

if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it
Expand Down Expand Up @@ -2681,6 +2693,88 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {

} // namespace

void IGroupLPDAGMutation::addMfmaFalseDeps() {
DenseMap<SUnit *, SUnit *> MFMAAncestor;
SmallVector<SUnit *, 10> MFMAChainLeaders;
DenseMap<SUnit *, SUnit *> MFMAChainNext;
for (auto &SU : DAG->SUnits) {
if (!TII->isMFMAorWMMA(*SU.getInstr()))
continue;

if (MFMAAncestor.contains(&SU))
continue;

SUnit *CurrMFMA = &SU;
MFMAAncestor[CurrMFMA] = CurrMFMA;
MFMAChainLeaders.push_back(&SU);
while (!CurrMFMA->Succs.empty()) {
// Count the number of successor MFMA/WMMA instructions of
// the current MFMA instruction.
SUnit *NextMFMA = nullptr;
unsigned MFMADataDepSuccCount = 0;
for (const auto &Succ : CurrMFMA->Succs) {
SUnit *SuccSU = Succ.getSUnit();
if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr()))
continue;

// Check if the successor is MFMA/WMMA and the edge is a data dependency
if (Succ.getKind() == SDep::Data) {
NextMFMA = SuccSU;
MFMADataDepSuccCount++;
}
}

// If the current MFMA instruction has more than one successor MFMA/WMMA
// instruction, we need to break the chain.
if (MFMADataDepSuccCount != 1) {
MFMAChainNext[CurrMFMA] = nullptr;
break;
}

// Add the current MFMA instruction to the MFMAAncestor map.
MFMAAncestor[CurrMFMA] = &SU;
MFMAChainNext[CurrMFMA] = NextMFMA;
CurrMFMA = NextMFMA;
}
}

// Compute the tail and length of each chain in a single loop.
auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> {
unsigned Length = 1;
SUnit *Curr = Leader;
while (MFMAChainNext.count(Curr)) {
if (!MFMAChainNext[Curr])
break;
Curr = MFMAChainNext[Curr];
++Length;
}
return {Curr, Length};
};

// Assert that all MFMA chains are ordered by NodeNum
// Add artificial false dependencies between MFMA chains if two given
// chains are at least 2 SUs long.
// Iterate over all pairs of contiguous MFMA chains and add artificial edges
// if chains are at least 2 SUs long.
for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) {
SUnit *ChainLeaderA = MFMAChainLeaders[I];
SUnit *ChainLeaderB = MFMAChainLeaders[I + 1];

auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA);
auto [TailB, LengthB] = GetTailAndLength(ChainLeaderB);

// Only add if both chains are at least two SUs long.
if (LengthA >= 2 && LengthB >= 2) {
// Add an artificial dependency edge from the tail of chain A to the
// leader of chain B.
LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from "
<< TailA->NodeNum << " to " << ChainLeaderB->NodeNum
<< "\n");
DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial));
}
}
}

/// \p Phase specifes whether or not this is a reentry into the
/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
/// same scheduling region (e.g. pre and post-RA scheduling / multiple
Expand Down
Loading