Skip to content

Commit

Permalink
[VPlan] Use ResumePhi to create reduction resume phis. (#110004)
Browse files Browse the repository at this point in the history
Use VPInstruction::ResumePhi to create phi nodes for reduction resume
values in the scalar preheader, similar to how ResumePhis are used for
first-order recurrence resume values after 9a5a873.

This allows simplifying createAndCollectMergePhiForReduction to only
collect reduction resume phis when vectorizing epilogue loops and adding
extra incoming edges from the main vector loop. Updating phis for the
epilogue vector loops requires special attention, because additional
incoming values from the bypass blocks need to be added.

PR: #110004
  • Loading branch information
fhahn authored Oct 28, 2024
1 parent 66bbbf2 commit 0d0abb3
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 62 deletions.
130 changes: 68 additions & 62 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7562,67 +7562,62 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
}
}

// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
// create a merge phi node for it.
static void createAndCollectMergePhiForReduction(
VPInstruction *RedResult,
VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
bool VectorizingEpilogue) {
if (!RedResult ||
RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
// fix the reduction's scalar PHI node by adding the incoming value from the
// main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) {
auto *EpiRedResult = dyn_cast<VPInstruction>(R);
if (!EpiRedResult ||
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
return;

auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane()));
auto *ResumePhi =
dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
ResumePhi = cast<PHINode>(Cmp->getOperand(0));
}
assert((!VectorizingEpilogue || ResumePhi) &&
"when vectorizing the epilogue loop, we need a resume phi from main "
"vector loop");

// TODO: bc.merge.rdx should not be created here, instead it should be
// modeled in VPlan.
BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
auto *BCBlockPhi =
PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator()->getIterator());

// If we are fixing reductions in the epilogue loop then we should already
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
// we carry over the incoming values correctly.
auto *EpiRedHeaderPhi =
cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc =
EpiRedHeaderPhi->getRecurrenceDescriptor();
Value *MainResumeValue =
EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
auto *Cmp = cast<ICmpInst>(MainResumeValue);
assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
"AnyOf expected to start with ICMP_NE");
assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
"AnyOf expected to start by comparing main resume value to original "
"start value");
MainResumeValue = Cmp->getOperand(0);
}
PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);

// When fixing reductions in the epilogue loop we should already have
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
using namespace VPlanPatternMatch;
auto IsResumePhi = [](VPUser *U) {
return match(
U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
};
assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
"ResumePhi must have a single user");
auto *EpiResumePhiVPI =
cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();
bool Updated = false;
for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
if (Incoming == LoopMiddleBlock)
BCBlockPhi->addIncoming(FinalValue, Incoming);
else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
Incoming);
else
BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
if (is_contained(MainResumePhi->blocks(), Incoming)) {
assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
RdxDesc.getRecurrenceStartValue() &&
"Trying to reset unexpected value");
assert(!Updated && "Should update at most 1 incoming value");
EpiResumePhi->setIncomingValueForBlock(
Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
Updated = true;
}
}

auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
// TODO: This fixup should instead be modeled in VPlan.
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
assert(Updated && "Must update EpiResumePhi.");
(void)Updated;
}

DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
Expand Down Expand Up @@ -7713,11 +7708,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.5 Collect reduction resume values.
auto *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
createAndCollectMergePhiForReduction(
dyn_cast<VPInstruction>(&R), State, OrigLoop,
State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue);
}
if (VectorizingEpilogue)
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
&R, State, State.CFG.VPBB2IRBB[ExitVPBB]);
}

// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
Expand Down Expand Up @@ -9518,6 +9513,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
});
FinalReductionResult->insertBefore(*MiddleVPBB, IP);

// Order is strict: if there are multiple successors, the first is the exit
// block, second is the scalar preheader.
VPBasicBlock *ScalarPHVPBB =
cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
{}, "bc.merge.rdx");
auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
Plan->addLiveOut(RedPhi, ResumePhiRecipe);

// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in
// any iteration. The final value is selected by the final
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: scalar.ph:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; IF-EVL-INLOOP-NEXT: }
;

Expand Down Expand Up @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; NO-VP-OUTLOOP-NEXT: }
;

Expand Down Expand Up @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: scalar.ph:
; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; NO-VP-INLOOP-NEXT: }
;
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]>
; CHECK-NEXT: }
;
entry:
Expand Down
9 changes: 9 additions & 0 deletions llvm/test/Transforms/LoopVectorize/vplan-printing.ll
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
; CHECK-NEXT: }
;
entry:
Expand Down Expand Up @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
; CHECK-NEXT: }
;
entry:
Expand Down Expand Up @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]>
; CHECK-NEXT:}

entry:
Expand Down

0 comments on commit 0d0abb3

Please sign in to comment.