Skip to content

Commit c80f16e

Browse files
committed
[LV] Always include middle block cost in isOutsideLoopWorkProfitable.
Always include the cost of the middle block in isOutsideLoopWorkProfitable. This addresses the TODO from #168949 and removes the temporary restriction. isOutsideLoopWorkProfitable already scales the cost outside loops according to the expected trip counts. In practice this increases the minimum iteration threshold in a few cases. On a large IR corpus based on C/C++ workloads, ~50 out of 179450 vector loops have their thresholds increased slightly.
1 parent 8378a6f commit c80f16e

File tree

6 files changed

+16
-17
lines changed

6 files changed

+16
-17
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9329,13 +9329,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
93299329
// one exists.
93309330
TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
93319331

9332-
// If the expected trip count is less than the VF, the vector loop will only
9333-
// execute a single iteration. Then the middle block is executed the same
9334-
// number of times as the vector region.
9335-
// TODO: Extend logic to always account for the cost of the middle block.
9336-
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9337-
if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
9338-
TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
9332+
TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
93399333

93409334
// When interleaving only scalar and vector cost will be equal, which in turn
93419335
// would lead to a divide by 0. Fall back to hard threshold.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,10 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
11041104
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
11051105
}
11061106
case VPInstruction::ExtractLastLane: {
1107+
// TODO: ExtractLastLane for scalar VF is a no-op. Remove before ::execute.
1108+
if (VF.isScalar())
1109+
return 0;
1110+
11071111
// Add on the cost of extracting the element.
11081112
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
11091113
return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Instruction::ExtractElement,

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
9696
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
9797
; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
9898
; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
99-
; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
99+
; CHECK: LV: Vectorization is possible but not beneficial.
100100
entry:
101101
br label %loop.header
102102

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
9494
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
9595
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
9696
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
97-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
97+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
9898
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
9999
; CHECK: vector.ph:
100100
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef reado
88
; CHECK: Calculating cost of runtime checks:
99
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop.
1010
; CHECK: Total cost of runtime checks: 4
11-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
11+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
1212
entry:
1313
br label %inner.loop
1414

@@ -34,7 +34,7 @@ define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonl
3434
; CHECK: Calculating cost of runtime checks:
3535
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
3636
; CHECK: Total cost of runtime checks: 3
37-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
37+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
3838
entry:
3939
br label %outer.loop
4040

@@ -71,7 +71,7 @@ define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef rea
7171
; CHECK: Calculating cost of runtime checks:
7272
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
7373
; CHECK: Total cost of runtime checks: 2
74-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
74+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
7575
entry:
7676
br label %outer.loop
7777

@@ -108,7 +108,7 @@ define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef re
108108
; CHECK: Calculating cost of runtime checks:
109109
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
110110
; CHECK: Total cost of runtime checks: 1
111-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
111+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
112112
entry:
113113
br label %outer.loop
114114

@@ -145,7 +145,7 @@ define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonl
145145
; CHECK: Calculating cost of runtime checks:
146146
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
147147
; CHECK: Total cost of runtime checks: 2
148-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
148+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
149149
entry:
150150
br label %outer.loop
151151

@@ -182,7 +182,7 @@ define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef re
182182
; CHECK: Calculating cost of runtime checks:
183183
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1
184184
; CHECK: Total cost of runtime checks: 1
185-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
185+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:16
186186
entry:
187187
br label %outer.loop
188188

@@ -219,7 +219,7 @@ define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr n
219219
; CHECK: Calculating cost of runtime checks:
220220
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2
221221
; CHECK: Total cost of runtime checks: 2
222-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
222+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:4
223223
entry:
224224
br label %outer.loop
225225

llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
1616
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]]
1717
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
1818
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1
19-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]]
19+
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP3]], i32 6)
20+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[UMAX]]
2021
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
2122
; CHECK: vector.ph:
2223
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()

0 commit comments

Comments
 (0)