[VPlan] Introduce ResumePhi VPInstruction, use to create phi for FOR. (#94760)

This patch introduces a new ResumePhi VPInstruction which creates a phi
in a leaf block of a VPlan. The first use is to create the phi node for
fixed-order recurrence resume values in the scalar preheader.

The VPInstruction takes 2 operands: 1) the incoming value from the
middle block and 2) a default value to be used for all other incoming
blocks.
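
As a rough illustration (names here are made up for the example, not taken
from this patch), such a recipe would print in a VPlan dump along these
lines, using the "resume-phi" opcode name added below:

```
scalar.ph:
  EMIT vp<%scalar.recur.init> = resume-phi vp<%vector.recur.extract>, ir<%init>
```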

In follow-up changes, it will also be used to create phis for reduction
and induction resume values.
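
Concretely, a minimal sketch of the IR the recipe materializes in the scalar
preheader, mirroring the CHECK lines updated in the tests below:

```llvm
; Resume value for a fixed-order recurrence: the element extracted from the
; end of the vector value when coming from the middle block, and the original
; start value (here 0) on the edge from entry.
scalar.ph:
  %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ 0, %entry ]
  br label %loop
```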

Depends on #92651

PR: #94760
fhahn authored Jul 11, 2024
1 parent 4502ea8 commit 9a5a873
Showing 26 changed files with 343 additions and 273 deletions.
103 changes: 56 additions & 47 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -599,10 +599,6 @@ class InnerLoopVectorizer {
BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
VPlan &Plan, VPTransformState &State);

/// Create the phi node for the resume value of first order recurrences in the
/// scalar preheader and update the users in the scalar loop.
void fixFixedOrderRecurrence(VPLiveOut *LO, VPTransformState &State);

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
@@ -3286,19 +3282,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
if (EnableVPlanNativePath)
fixNonInductionPHIs(Plan, State);

// At this point every instruction in the original loop is widened to a
// vector form. Note that fixing reduction phis, as well as extracting the
// exit and resume values for fixed-order recurrences are already modeled in
// VPlan. All that remains to do here is to create a phi in the scalar
// pre-header for each fixed-order recurrence resume value.
// TODO: Also model creating phis in the scalar pre-header in VPlan.
for (const auto &[_, LO] : to_vector(Plan.getLiveOuts())) {
if (!Legal->isFixedOrderRecurrence(LO->getPhi()))
continue;
fixFixedOrderRecurrence(LO, State);
Plan.removeLiveOut(LO->getPhi());
}

// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);
PSE.getSE()->forgetBlockAndLoopDispositions();
@@ -3335,10 +3318,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VectorLoop->getHeader(), Plan, State);
}

// Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
// in the exit block, so update the builder.
State.Builder.SetInsertPoint(State.CFG.ExitBB,
State.CFG.ExitBB->getFirstNonPHIIt());
// Fix live-out phis not already fixed earlier.
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);

@@ -3366,32 +3346,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VF.getKnownMinValue() * UF);
}

void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO,
VPTransformState &State) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
VPValue *VPExtract = LO->getOperand(0);
using namespace llvm::VPlanPatternMatch;
assert(match(VPExtract, m_VPInstruction<VPInstruction::ExtractFromEnd>(
m_VPValue(), m_VPValue())) &&
"FOR LiveOut expects to use an extract from end.");
Value *ResumeScalarFOR = State.get(VPExtract, UF - 1, true);

// Fix the initial value of the original recurrence in the scalar loop.
PHINode *ScalarHeaderPhi = LO->getPhi();
auto *InitScalarFOR =
ScalarHeaderPhi->getIncomingValueForBlock(LoopScalarPreHeader);
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
auto *ScalarPreheaderPhi =
Builder.CreatePHI(ScalarHeaderPhi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR;
ScalarPreheaderPhi->addIncoming(Incoming, BB);
}
ScalarHeaderPhi->setIncomingValueForBlock(LoopScalarPreHeader,
ScalarPreheaderPhi);
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
@@ -8798,6 +8752,59 @@ static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
}
}

/// Feed a resume value for every FOR from the vector loop to the scalar loop,
/// if middle block branches to scalar preheader, by introducing ExtractFromEnd
/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
/// latter and corresponds to the scalar header.
static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();

// Start by finding out if middle block branches to scalar preheader, which is
// not a VPIRBasicBlock, unlike Exit block - the other possible successor of
// middle block.
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
VPBasicBlock *ScalarPHVPBB = nullptr;
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
if (isa<VPIRBasicBlock>(Succ))
continue;
assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
ScalarPHVPBB = cast<VPBasicBlock>(Succ);
}
if (!ScalarPHVPBB)
return;

VPBuilder ScalarPHBuilder(ScalarPHVPBB);
VPBuilder MiddleBuilder(MiddleVPBB);
// Reset insert point so new recipes are inserted before terminator and
// condition, if there is either the former or both.
if (auto *Terminator = MiddleVPBB->getTerminator()) {
auto *Condition = dyn_cast<VPInstruction>(Terminator->getOperand(0));
assert((!Condition || Condition->getParent() == MiddleVPBB) &&
"Condition expected in MiddleVPBB");
MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
}
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));

for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
if (!FOR)
continue;

// Extract the resume value and create a new VPLiveOut for it.
auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
{FOR->getBackedgeValue(), OneVPV},
{}, "vector.recur.extract");
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), ResumePhiRecipe);
}
}

VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

@@ -8967,6 +8974,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();

addLiveOutsForFirstOrderRecurrences(*Plan);

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
25 changes: 18 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -697,7 +697,10 @@ class VPBlockBase {
};

/// A value that is used outside the VPlan. The operand of the user needs to be
/// added to the associated LCSSA phi node.
/// added to the associated phi node. The incoming block from VPlan is
/// determined by where the VPValue is defined: if it is defined by a recipe
/// outside a region, its parent block is used, otherwise the middle block is
/// used.
class VPLiveOut : public VPUser {
PHINode *Phi;

@@ -709,11 +712,10 @@ class VPLiveOut : public VPUser {
return U->getVPUserID() == VPUser::VPUserID::LiveOut;
}

/// Fixup the wrapped LCSSA phi node in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
/// already in place, and we exit the vector loop exclusively to the middle
/// block.
/// Fix the wrapped phi node. This means adding an incoming value to exit
/// block phi's from the vector loop via middle block (values from scalar loop
/// already reach these phi's), and updating the value to scalar header phi's
/// from the scalar preheader.
void fixPhi(VPlan &Plan, VPTransformState &State);

/// Returns true if the VPLiveOut uses scalars of operand \p Op.
@@ -1238,6 +1240,11 @@ class VPInstruction : public VPRecipeWithIRFlags {
SLPStore,
ActiveLaneMask,
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
/// the second operand is the incoming value for all other predecessors
/// (which are currently not modeled in VPlan).
ResumePhi,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
@@ -1386,6 +1393,10 @@ class VPInstruction : public VPRecipeWithIRFlags {
/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
bool isVectorToScalar() const;

/// Returns true if this VPInstruction's operands are single scalars and the
/// result is also a single scalar.
bool isSingleScalar() const;
};

/// VPWidenRecipe is a recipe for producing a copy of vector type its
@@ -3766,7 +3777,7 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
if (auto *VPI = dyn_cast<VPInstruction>(Def))
return VPI->isVectorToScalar();
return VPI->isSingleScalar() || VPI->isVectorToScalar();
return false;
}

56 changes: 49 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -197,9 +197,22 @@ void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
: VPLane::getLastLaneForVF(State.VF);
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
MiddleBB);
VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
// Values leaving the vector loop reach live out phi's in the exiting block
// via middle block.
auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
? MiddleVPBB
: ExitingVPBB;
BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
// Set insertion point in PredBB in case an extract needs to be generated.
// TODO: Model extracts explicitly.
State.Builder.SetInsertPoint(PredBB, PredBB->getFirstNonPHIIt());
Value *V = State.get(ExitValue, VPIteration(State.UF - 1, Lane));
if (Phi->getBasicBlockIndex(PredBB) != -1)
Phi->setIncomingValueForBlock(PredBB, V);
else
Phi->addIncoming(V, PredBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -338,7 +351,7 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
bool VPInstruction::canGenerateScalarForFirstLane() const {
if (Instruction::isBinaryOp(getOpcode()))
return true;
if (isVectorToScalar())
if (isSingleScalar() || isVectorToScalar())
return true;
switch (Opcode) {
case Instruction::ICmp:
@@ -638,6 +651,27 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
return Builder.CreatePtrAdd(Ptr, Addend, Name);
}
case VPInstruction::ResumePhi: {
if (Part != 0)
return State.get(this, 0, /*IsScalar*/ true);
Value *IncomingFromVPlanPred =
State.get(getOperand(0), Part, /* IsScalar */ true);
Value *IncomingFromOtherPreds =
State.get(getOperand(1), Part, /* IsScalar */ true);
auto *NewPhi =
Builder.CreatePHI(IncomingFromOtherPreds->getType(), 2, Name);
BasicBlock *VPlanPred =
State.CFG
.VPBB2IRBB[cast<VPBasicBlock>(getParent()->getSinglePredecessor())];
NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred);
for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) {
assert(OtherPred != VPlanPred &&
"VPlan predecessors should not be connected yet");
NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred);
}
return NewPhi;
}

default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -648,6 +682,10 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeReductionResult;
}

bool VPInstruction::isSingleScalar() const {
return getOpcode() == VPInstruction::ResumePhi;
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
// Inspired by FPMathOperator::classof. Notable differences are that we don't
@@ -668,9 +706,9 @@ void VPInstruction::execute(VPTransformState &State) {
if (hasFastMathFlags())
State.Builder.setFastMathFlags(getFastMathFlags());
State.setDebugLocFrom(getDebugLoc());
bool GeneratesPerFirstLaneOnly =
canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
isVectorToScalar() || isSingleScalar());
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(this);
for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -722,6 +760,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
return true;
};
llvm_unreachable("switch should return");
@@ -774,6 +813,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
case VPInstruction::ResumePhi:
O << "resume-phi";
break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
12 changes: 3 additions & 9 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -937,22 +937,16 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
Type *IntTy = Plan.getCanonicalIV()->getScalarType();

// Extract the penultimate value of the recurrence and update VPLiveOut
// users of the recurrence splice.
// users of the recurrence splice. Note that the extract of the final value
// used to resume in the scalar loop is created earlier during VPlan
// construction.
auto *Penultimate = cast<VPInstruction>(MiddleBuilder.createNaryOp(
VPInstruction::ExtractFromEnd,
{FOR->getBackedgeValue(),
Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
{}, "vector.recur.extract.for.phi"));
RecurSplice->replaceUsesWithIf(
Penultimate, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });

// Extract the resume value and create a new VPLiveOut for it.
auto *Resume = MiddleBuilder.createNaryOp(
VPInstruction::ExtractFromEnd,
{FOR->getBackedgeValue(),
Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 1))},
{}, "vector.recur.extract");
Plan.addLiveOut(cast<PHINode>(FOR->getUnderlyingInstr()), Resume);
}
return true;
}
@@ -73,8 +73,8 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) {
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP24]], i32 3
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -51,8 +51,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
@@ -160,10 +160,10 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE45]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT6]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE44]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT9]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]