@@ -174,6 +174,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
174174STATISTIC (LoopsVectorized, " Number of loops vectorized" );
175175STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
176176STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
177+ STATISTIC (CSAsVectorized,
178+ " Number of conditional scalar assignments vectorized" );
177179
178180static cl::opt<bool > EnableEpilogueVectorization (
179181 " enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -498,6 +500,10 @@ class InnerLoopVectorizer {
498500 // / Fix the vectorized code, taking care of header phi's, and more.
499501 void fixVectorizedLoop (VPTransformState &State);
500502
503+ // / For all vectorized CSAs, replace uses of the live-out scalar from the
504+ // / original loop with the scalar extracted from the vector loop.
505+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
506+
501507 // Return true if any runtime check is added.
502508 bool areSafetyChecksAdded () { return AddedSafetyChecks; }
503509
@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29372943 TargetTransformInfo::TCK_RecipThroughput);
29382944}
29392945
2946+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2947+ for (const auto &CSA : Plan.getCSAStates ()) {
2948+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2949+ assert (VPDataUpdate &&
2950+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2951+ Value *V = VPDataUpdate->getUnderlyingValue ();
2952+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2953+ /* NeedsScalar=*/ true );
2954+ // Fix LCSSAPhis
2955+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2956+ for (User *U : V->users ())
2957+ if (auto *Phi = dyn_cast<PHINode>(U);
2958+ Phi && Phi->getParent () == LoopExitBlock)
2959+ ToFix.insert (Phi);
2960+ for (PHINode *Phi : ToFix)
2961+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2962+ }
2963+ }
2964+
29402965void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
29412966 // Fix widened non-induction PHIs by setting up the PHI operands.
29422967 if (EnableVPlanNativePath)
@@ -2971,6 +2996,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29712996 for (const auto &Entry : Legal->getInductionVars ())
29722997 fixupIVUsers (Entry.first , Entry.second ,
29732998 getOrCreateVectorTripCount (nullptr ), LoopMiddleBlock, State);
2999+ fixCSALiveOuts (State, Plan);
29743000 }
29753001
29763002 for (Instruction *PI : PredicatedInstructions)
@@ -4516,6 +4542,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45164542 case VPDef::VPEVLBasedIVPHISC:
45174543 case VPDef::VPPredInstPHISC:
45184544 case VPDef::VPBranchOnMaskSC:
4545+ case VPRecipeBase::VPCSADataUpdateSC:
4546+ case VPRecipeBase::VPCSAExtractScalarSC:
4547+ case VPRecipeBase::VPCSAHeaderPHISC:
45194548 continue ;
45204549 case VPDef::VPReductionSC:
45214550 case VPDef::VPActiveLaneMaskPHISC:
@@ -8701,9 +8730,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87018730 return Recipe;
87028731
87038732 VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8704- assert ((Legal->isReductionVariable (Phi) ||
8705- Legal->isFixedOrderRecurrence (Phi)) &&
8706- " can only widen reductions and fixed-order recurrences here" );
87078733 VPValue *StartV = Operands[0 ];
87088734 if (Legal->isReductionVariable (Phi)) {
87098735 const RecurrenceDescriptor &RdxDesc =
@@ -8713,12 +8739,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87138739 PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
87148740 CM.isInLoopReduction (Phi),
87158741 CM.useOrderedReductions (RdxDesc));
8716- } else {
8742+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
87178743 // TODO: Currently fixed-order recurrences are modeled as chains of
87188744 // first-order recurrences. If there are no users of the intermediate
87198745 // recurrences in the chain, the fixed order recurrence should be modeled
87208746 // directly, enabling more efficient codegen.
87218747 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8748+ } else if (Legal->isCSAPhi (Phi)) {
8749+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8750+ VPValue *InitData = State->getVPInitData ();
8751+ // When the VF=getFixed(1), InitData is just InitScalar.
8752+ if (!InitData)
8753+ InitData = State->getVPInitScalar ();
8754+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8755+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8756+ } else {
8757+ llvm_unreachable (
8758+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
87228759 }
87238760
87248761 PhisToFix.push_back (PhiRecipe);
@@ -8752,6 +8789,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87528789 make_range (Operands.begin (), Operands.end ()));
87538790
87548791 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8792+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8793+ return CSADescriptor::isCSASelect (CSA.second , SI);
8794+ });
8795+ if (CSADescIt != Legal->getCSAs ().end ()) {
8796+ PHINode *CSAPhi = CSADescIt->first ;
8797+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8798+ VPValue *VPDataPhi = State->getPhiRecipe ();
8799+ auto *R = new VPCSADataUpdateRecipe (
8800+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8801+ State->setDataUpdate (R);
8802+ return R;
8803+ }
8804+
87558805 return new VPWidenSelectRecipe (
87568806 *SI, make_range (Operands.begin (), Operands.end ()));
87578807 }
@@ -8764,6 +8814,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87648814 return tryToWiden (Instr, Operands, VPBB);
87658815}
87668816
8817+ // / Add CSA Recipes that can occur before each instruction in the input IR
8818+ // / is processed and introduced into VPlan.
8819+ static void
8820+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8821+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8822+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8823+ VPlan &Plan) {
8824+
8825+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8826+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8827+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8828+
8829+ for (const auto &CSA : CSAs) {
8830+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8831+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8832+
8833+ // Scalar VF builds the scalar version of the loop. In that case,
8834+ // no maintenence of mask nor extraction in middle block is needed.
8835+ if (IsScalarVF) {
8836+ VPCSAState *S = new VPCSAState (VPInitScalar);
8837+ Plan.addCSAState (CSA.first , S);
8838+ continue ;
8839+ }
8840+
8841+ auto *VPInitMask =
8842+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8843+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8844+ {VPInitScalar}, DL, " csa.init.data" );
8845+ PreheaderVPBB->appendRecipe (VPInitMask);
8846+ PreheaderVPBB->appendRecipe (VPInitData);
8847+
8848+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8849+ DL, " csa.mask.phi" );
8850+ HeaderVPBB->appendRecipe (VPMaskPhi);
8851+
8852+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8853+ Plan.addCSAState (CSA.first , S);
8854+ }
8855+ }
8856+
8857+ // / Add CSA Recipes that must occur after each instruction in the input IR
8858+ // / is processed and introduced into VPlan.
8859+ static void
8860+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8861+ const LoopVectorizationLegality::CSAList &CSAs,
8862+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8863+ VPlan &Plan) {
8864+ // Don't build CSA for VF=ElementCount::getFixed(1)
8865+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8866+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8867+ return ;
8868+
8869+ for (const auto &CSA : CSAs) {
8870+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8871+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8872+
8873+ assert (VPDataUpdate &&
8874+ " VPDataUpdate must have been introduced prior to postprocess" );
8875+ assert (CSA.second .getCond () &&
8876+ " CSADescriptor must know how to describe the condition" );
8877+ auto GetVPValue = [&](Value *I) {
8878+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8879+ };
8880+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8881+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8882+
8883+ // The CSA optimization wants to use a condition such that when it is
8884+ // true, a new value is assigned. However, it is possible that a true lane
8885+ // in WidenedCond corresponds to selection of the initial value instead.
8886+ // In that case, we must use the negation of WidenedCond.
8887+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8888+ VPValue *CondToUse = WidenedCond;
8889+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8890+ CSA.first ) {
8891+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8892+ VPNotCond->insertBefore (
8893+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8894+ CondToUse = VPNotCond;
8895+ }
8896+
8897+ auto *VPAnyActive = new VPInstruction (
8898+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8899+ VPAnyActive->insertBefore (
8900+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8901+
8902+ auto *VPMaskSel = new VPInstruction (
8903+ VPInstruction::CSAMaskSel,
8904+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8905+ VPMaskSel->insertAfter (VPAnyActive);
8906+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8907+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8908+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8909+
8910+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8911+
8912+ // Update CSAState with new recipes
8913+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8914+ CSAState->setVPAnyActive (VPAnyActive);
8915+ }
8916+ }
8917+
87678918void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
87688919 ElementCount MaxVF) {
87698920 assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8856,7 +9007,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88569007// increments.
88579008static SetVector<VPIRInstruction *> collectUsersInExitBlocks (
88589009 Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8859- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
9010+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
9011+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
88609012 SetVector<VPIRInstruction *> ExitUsersToFix;
88619013 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks ()) {
88629014 BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock ();
@@ -8887,6 +9039,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
88879039 return P && Inductions.contains (P);
88889040 })))
88899041 continue ;
9042+ // Exit values for CSAs are computed and updated outside of VPlan and
9043+ // independent of induction recipes.
9044+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9045+ // live-outs.
9046+ if (isa<VPCSADataUpdateRecipe>(V) &&
9047+ (isa<Instruction>(IncomingValue) &&
9048+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9049+ auto *P = dyn_cast<PHINode>(U);
9050+ return P && CSAs.contains (P);
9051+ })))
9052+ continue ;
88909053 ExitUsersToFix.insert (ExitIRI);
88919054 ExitIRI->addOperand (V);
88929055 }
@@ -9068,6 +9231,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90689231 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
90699232 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
90709233
9234+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9235+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9236+ Range, *Plan);
9237+
90719238 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
90729239
90739240 // ---------------------------------------------------------------------------
@@ -9185,6 +9352,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91859352 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
91869353 }
91879354
9355+ VPBasicBlock *MiddleVPBB =
9356+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9357+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9358+ Range, *Plan);
9359+
91889360 // After here, VPBB should not be used.
91899361 VPBB = nullptr ;
91909362
@@ -9195,8 +9367,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91959367 RecipeBuilder.fixHeaderPhis ();
91969368
91979369 addScalarResumePhis (RecipeBuilder, *Plan);
9198- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks (
9199- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9370+ SetVector<VPIRInstruction *> ExitUsersToFix =
9371+ collectUsersInExitBlocks (OrigLoop, RecipeBuilder, *Plan,
9372+ Legal->getInductionVars (), Legal->getCSAs ());
92009373 addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
92019374 addUsersInExitBlocks (*Plan, ExitUsersToFix);
92029375 // ---------------------------------------------------------------------------
@@ -10256,6 +10429,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1025610429 auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
1025710430 *BestMainPlan, MainILV, DT, false );
1025810431 ++LoopsVectorized;
10432+ CSAsVectorized += LVL.getCSAs ().size ();
1025910433
1026010434 // Second pass vectorizes the epilogue and adjusts the control flow
1026110435 // edges from the first pass.
@@ -10351,6 +10525,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1035110525 PSI, Checks, BestPlan);
1035210526 LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1035310527 ++LoopsVectorized;
10528+ CSAsVectorized += LVL.getCSAs ().size ();
1035410529
1035510530 // Add metadata to disable runtime unrolling a scalar loop when there
1035610531 // are no runtime checks about strides and memory. A scalar loop that is
0 commit comments