Skip to content

Commit b2d31cb

Browse files
committed
Reland "AMDGPU/PromoteAlloca: Simplify how deferred loads work (#170510)"
The second pass of promotion to vector can be quite simple. Reflect that simplicity in the code for better maintainability. v2: - don't put placeholders into the SSAUpdater, and add a test that shows the problem commit-id:d6d2255a
1 parent 762a171 commit b2d31cb

File tree

2 files changed

+69
-46
lines changed

2 files changed

+69
-46
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 36 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -502,27 +502,14 @@ static Value *promoteAllocaUserToVector(
502502
Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
503503
unsigned VecStoreSize, unsigned ElementSize,
504504
DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
505-
std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
506-
SmallVectorImpl<LoadInst *> &DeferredLoads) {
505+
std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
506+
function_ref<Value *()> GetCurVal) {
507507
// Note: we use InstSimplifyFolder because it can leverage the DataLayout
508508
// to do more folding, especially in the case of vector splats.
509509
IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
510510
InstSimplifyFolder(DL));
511511
Builder.SetInsertPoint(Inst);
512512

513-
const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
514-
if (CurVal)
515-
return CurVal;
516-
517-
// If the current value is not known, insert a dummy load and lower it on
518-
// the second pass.
519-
LoadInst *Dummy =
520-
Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
521-
"promotealloca.dummyload");
522-
DeferredLoads.push_back(Dummy);
523-
return Dummy;
524-
};
525-
526513
const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
527514
Type *PtrTy) -> Value * {
528515
assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -542,12 +529,7 @@ static Value *promoteAllocaUserToVector(
542529

543530
switch (Inst->getOpcode()) {
544531
case Instruction::Load: {
545-
// Loads can only be lowered if the value is known.
546-
if (!CurVal) {
547-
DeferredLoads.push_back(cast<LoadInst>(Inst));
548-
return nullptr;
549-
}
550-
532+
Value *CurVal = GetCurVal();
551533
Value *Index = calculateVectorIndex(
552534
cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
553535

@@ -637,7 +619,7 @@ static Value *promoteAllocaUserToVector(
637619

638620
Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
639621

640-
Value *CurVec = GetOrLoadCurrentVectorValue();
622+
Value *CurVec = GetCurVal();
641623
for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
642624
K < NumElts; ++K) {
643625
Value *CurIdx =
@@ -650,8 +632,7 @@ static Value *promoteAllocaUserToVector(
650632

651633
if (Val->getType() != VecEltTy)
652634
Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
653-
return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
654-
Index);
635+
return Builder.CreateInsertElement(GetCurVal(), Val, Index);
655636
}
656637
case Instruction::Call: {
657638
if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -673,7 +654,7 @@ static Value *promoteAllocaUserToVector(
673654
}
674655
}
675656

676-
return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
657+
return Builder.CreateShuffleVector(GetCurVal(), Mask);
677658
}
678659

679660
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -1038,37 +1019,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
10381019

10391020
Updater.AddAvailableValue(EntryBB, AllocaInitValue);
10401021

1041-
// First handle the initial worklist.
1042-
SmallVector<LoadInst *, 4> DeferredLoads;
1022+
// First handle the initial worklist, in basic block order.
1023+
//
1024+
// Insert a placeholder whenever we need the vector value at the top of a
1025+
// basic block.
1026+
SmallVector<Instruction *> Placeholders;
10431027
forEachWorkListItem(WorkList, [&](Instruction *I) {
10441028
BasicBlock *BB = I->getParent();
1045-
// On the first pass, we only take values that are trivially known, i.e.
1046-
// where AddAvailableValue was already called in this block.
1047-
Value *Result = promoteAllocaUserToVector(
1048-
I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1049-
Updater.FindValueForBlock(BB), DeferredLoads);
1029+
auto GetCurVal = [&]() -> Value * {
1030+
if (Value *CurVal = Updater.FindValueForBlock(BB))
1031+
return CurVal;
1032+
1033+
if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1034+
return Placeholders.back();
1035+
1036+
// If the current value in the basic block is not yet known, insert a
1037+
// placeholder that we will replace later.
1038+
IRBuilder<> Builder(I);
1039+
auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1040+
PoisonValue::get(VectorTy), "promotealloca.placeholder"));
1041+
Placeholders.push_back(Placeholder);
1042+
return Placeholders.back();
1043+
};
1044+
1045+
Value *Result =
1046+
promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
1047+
TransferInfo, GEPVectorIdx, GetCurVal);
10501048
if (Result)
10511049
Updater.AddAvailableValue(BB, Result);
10521050
});
10531051

1054-
// Then handle deferred loads.
1055-
forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
1056-
SmallVector<LoadInst *, 0> NewDLs;
1057-
BasicBlock *BB = I->getParent();
1058-
// On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
1059-
// get a value, inserting PHIs as needed.
1060-
Value *Result = promoteAllocaUserToVector(
1061-
I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1062-
Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
1063-
if (Result)
1064-
Updater.AddAvailableValue(BB, Result);
1065-
assert(NewDLs.empty() && "No more deferred loads should be queued!");
1066-
});
1052+
// Now fixup the placeholders.
1053+
for (Instruction *Placeholder : Placeholders) {
1054+
Placeholder->replaceAllUsesWith(
1055+
Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1056+
Placeholder->eraseFromParent();
1057+
}
10671058

10681059
// Delete all instructions. On the first pass, new dummy loads may have been
10691060
// added so we need to collect them too.
10701061
DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1071-
InstsToDelete.insert_range(DeferredLoads);
10721062
for (Instruction *I : InstsToDelete) {
10731063
assert(I->use_empty());
10741064
I->eraseFromParent();
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
3+
4+
; Checks that certain CFGs are handled correctly.
5+
6+
define amdgpu_kernel void @chain(i32 %init, i32 %i.1, i32 %i.2) {
7+
; CHECK-LABEL: @chain(
8+
; CHECK-NEXT: entry:
9+
; CHECK-NEXT: [[A:%.*]] = freeze <4 x i8> poison
10+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[INIT:%.*]] to <4 x i8>
11+
; CHECK-NEXT: br label [[BB1:%.*]]
12+
; CHECK: bb1:
13+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_1:%.*]]
14+
; CHECK-NEXT: br label [[BB2:%.*]]
15+
; CHECK: bb2:
16+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_2:%.*]]
17+
; CHECK-NEXT: ret void
18+
;
19+
entry:
20+
%a = alloca [4 x i8], align 4, addrspace(5)
21+
store i32 %init, ptr addrspace(5) %a
22+
br label %bb1
23+
24+
bb1:
25+
%p.1 = getelementptr i8, ptr addrspace(5) %a, i32 %i.1
26+
%x.1 = load i8, ptr addrspace(5) %p.1
27+
br label %bb2
28+
29+
bb2:
30+
%p.2 = getelementptr i8, ptr addrspace(5) %a, i32 %i.2
31+
%x.2 = load i8, ptr addrspace(5) %p.2
32+
ret void
33+
}

0 commit comments

Comments
 (0)