Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VPlan] Use ResumePhi to create reduction resume phis. #110004

Merged
merged 12 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 67 additions & 61 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7562,67 +7562,62 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) {
}
}

// Check if \p RedResult is a ComputeReductionResult instruction, and if it is
// create a merge phi node for it.
static void createAndCollectMergePhiForReduction(
VPInstruction *RedResult,
VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
bool VectorizingEpilogue) {
if (!RedResult ||
RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
// If \p R is a ComputeReductionResult when vectorizing the epilog loop,
// fix the reduction's scalar PHI node by adding the incoming value from the
// main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
VPRecipeBase *R, VPTransformState &State, Loop *OrigLoop,
BasicBlock *LoopMiddleBlock) {
auto *EpiRedResult = dyn_cast<VPInstruction>(R);
if (!EpiRedResult ||
EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult)
return;

auto *PhiR = cast<VPReductionPHIRecipe>(RedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane()));
auto *ResumePhi =
dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
auto *Cmp = cast<ICmpInst>(PhiR->getStartValue()->getUnderlyingValue());
assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
ResumePhi = cast<PHINode>(Cmp->getOperand(0));
}
assert((!VectorizingEpilogue || ResumePhi) &&
"when vectorizing the epilogue loop, we need a resume phi from main "
"vector loop");

// TODO: bc.merge.rdx should not be created here, instead it should be
// modeled in VPlan.
auto *EpiRedHeaderPhi =
cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc =
EpiRedHeaderPhi->getRecurrenceDescriptor();
Value *MainResumeValue =
EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
auto *Cmp = cast<ICmpInst>(MainResumeValue);
assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
"AnyOf expected to start with ICMP_NE");
assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() &&
"AnyOf expected to start by comparing main resume value to original "
"start value");
MainResumeValue = Cmp->getOperand(0);
}
PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);

// When fixing reductions in the epilogue loop we should already have
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
using namespace VPlanPatternMatch;
auto IsResumePhi = [](VPUser *U) {
return match(
U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
};
assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
"ResumePhi must have a single user");
auto *EpiResumePhiVPI =
cast<VPInstruction>(*find_if(EpiRedResult->users(), IsResumePhi));
auto *EpiResumePhi = cast<PHINode>(State.get(EpiResumePhiVPI, true));
BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent();

and avoid passing OrigLoop to fixReductionScalarResumeWhenVectorizingEpilog()?

// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: what is the "backedge-taken check block", abbreviated as "BCBlock"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not entirely sure, but I think this meant the iteration checks.

auto *BCBlockPhi =
PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator()->getIterator());

// If we are fixing reductions in the epilogue loop then we should already
// have created a bc.merge.rdx Phi after the main vector body. Ensure that
// we carry over the incoming values correctly.
unsigned UpdateCnt = 0;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
unsigned UpdateCnt = 0;
bool Updated = false;

for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
if (Incoming == LoopMiddleBlock)
BCBlockPhi->addIncoming(FinalValue, Incoming);
else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
Incoming);
else
BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming);
if (is_contained(MainResumePhi->blocks(), Incoming)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we expect at most one such Incoming, is it worth asserting?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

assert(EpiResumePhi->getIncomingValueForBlock(Incoming) ==
RdxDesc.getRecurrenceStartValue() &&
"Trying to reset unexpected value");
EpiResumePhi->setIncomingValueForBlock(
Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
UpdateCnt++;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
EpiResumePhi->setIncomingValueForBlock(
Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
UpdateCnt++;
assert(!Updated && "Should update at most 1 incoming value");
EpiResumePhi->setIncomingValueForBlock(
Incoming, MainResumePhi->getIncomingValueForBlock(Incoming));
Updated = true;

}
}

auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
// TODO: This fixup should instead be modeled in VPlan.
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
assert(UpdateCnt <= 1 && "Only should update at most 1 incoming value");
(void)UpdateCnt;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert(UpdateCnt <= 1 && "Only should update at most 1 incoming value");
(void)UpdateCnt;
(void)Updated;

}

DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
Expand Down Expand Up @@ -7713,11 +7708,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 2.5 Collect reduction resume values.
auto *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
createAndCollectMergePhiForReduction(
dyn_cast<VPInstruction>(&R), State, OrigLoop,
State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue);
}
if (VectorizingEpilogue)
for (VPRecipeBase &R : *ExitVPBB) {
fixReductionScalarResumeWhenVectorizingEpilog(
&R, State, OrigLoop, State.CFG.VPBB2IRBB[ExitVPBB]);
}

// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
Expand Down Expand Up @@ -9518,6 +9513,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
});
FinalReductionResult->insertBefore(*MiddleVPBB, IP);

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the above TODO also be taken care of - refrain from creating ComputeReductionResult for in-loop reductions?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, but it may be better as a follow-up?

// Order is strict: if there are multiple successors, the first is the exit
// block, second is the scalar preheader.
VPBasicBlock *ScalarPHVPBB =
cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ResumePhi placed in the scalar preheader prepares for 2 values. When vectorizing an epilog loop, it should ideally prepare for 3: the ResumePhi after the main loop should feed the ResumePhi after the epilog loop (unless we're sure, according to the trip count, that if the main and epilog loops are run they will take care of all iterations, without leaving any remainder for the scalar loop). But neither the recipes nor the blocks of the two VPlans can currently be connected.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, unfortunately I think there is no way to determine at this point whether the plan will be used for epilogue vectorization. Once we model it explicitly in VPlan, the update should be connected here.

{}, "bc.merge.rdx");
auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
Plan->addLiveOut(RedPhi, ResumePhiRecipe);

// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in
// any iteration. The final value is selected by the final
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: scalar.ph:
; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; IF-EVL-INLOOP-NEXT: No successors
; IF-EVL-INLOOP-EMPTY:
; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; IF-EVL-INLOOP-NEXT: }
;

Expand Down Expand Up @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: scalar.ph:
; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; NO-VP-OUTLOOP-NEXT: No successors
; NO-VP-OUTLOOP-EMPTY:
; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; NO-VP-OUTLOOP-NEXT: }
;

Expand Down Expand Up @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) {
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: scalar.ph:
; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start>
; NO-VP-INLOOP-NEXT: No successors
; NO-VP-INLOOP-EMPTY:
; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]>
; NO-VP-INLOOP-NEXT: }
;
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0>
; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]>
; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]>
; CHECK-NEXT: }
;
entry:
Expand Down
9 changes: 9 additions & 0 deletions llvm/test/Transforms/LoopVectorize/vplan-printing.ll
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
; CHECK-NEXT: }
;
entry:
Expand Down Expand Up @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]>
; CHECK-NEXT: }
;
entry:
Expand Down Expand Up @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00>
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]>
; CHECK-NEXT:}

entry:
Expand Down