summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Utils/LoopUnroll.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnroll.cpp')
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp138
1 files changed, 138 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 86b268de43cf..b18aceaa67d7 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
#endif
);
+static cl::opt<bool> UnrollAddParallelReductions(
+ "unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
+ cl::desc("Allow unrolling to add parallel reduction phis."));
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
@@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
OrigPHINode.push_back(cast<PHINode>(I));
}
+ // Collect phi nodes for reductions for which we can introduce multiple
+ // parallel reduction phis and compute the final reduction result after the
+ // loop. This requires a single exit block after unrolling. This is ensured by
+ // restricting to single-block loops where the unrolled iterations are known
+ // to not exit.
+ DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
+ bool CanAddAdditionalAccumulators =
+ (UnrollAddParallelReductions.getNumOccurrences() > 0
+ ? UnrollAddParallelReductions
+ : ULO.AddAdditionalAccumulators) &&
+ !CompletelyUnroll && L->getNumBlocks() == 1 &&
+ (ULO.Runtime ||
+ (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
+ ExitInfos[Header].BreakoutTrip == 0))));
+
+ // Limit parallelizing reductions to unroll counts of 4 or less for now.
+ // TODO: The number of parallel reductions should depend on the number of
+ // execution units. We also don't have to add a parallel reduction phi per
+ // unrolled iteration, but could for example add a parallel phi for every 2
+ // unrolled iterations.
+ if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
+ for (PHINode &Phi : Header->phis()) {
+ auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
+ if (!RdxDesc)
+ continue;
+
+ // Only handle duplicate phis for a single reduction for now.
+ // TODO: Handle any number of reductions
+ if (!Reductions.empty())
+ continue;
+
+ Reductions[&Phi] = *RdxDesc;
+ }
+ }
+
std::vector<BasicBlock *> Headers;
std::vector<BasicBlock *> Latches;
Headers.push_back(Header);
@@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// latch. This is a reasonable default placement if we don't have block
// frequencies, and if we do, well the layout will be adjusted later.
auto BlockInsertPt = std::next(LatchBlock->getIterator());
+ SmallVector<Instruction *> PartialReductions;
for (unsigned It = 1; It != ULO.Count; ++It) {
SmallVector<BasicBlock *, 8> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (PHINode *OrigPHI : OrigPHINode) {
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+
+ // Use cloned phis as parallel phis for partial reductions, which will
+ // get combined to the final reduction result after the loop.
+ if (Reductions.contains(OrigPHI)) {
+ // Collect partial reduction results.
+ if (PartialReductions.empty())
+ PartialReductions.push_back(cast<Instruction>(InVal));
+ PartialReductions.push_back(cast<Instruction>(VMap[InVal]));
+
+ // Update the start value for the cloned phis to use the identity
+ // value for the reduction.
+ const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
+ NewPHI->setIncomingValueForBlock(
+ L->getLoopPreheader(),
+ getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
+ OrigPHI->getType(),
+ RdxDesc.getFastMathFlags()));
+
+ // Update NewPHI to use the cloned value for the iteration and move
+ // to header.
+ NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
+ NewPHI->moveBefore(OrigPHI->getIterator());
+ continue;
+ }
+
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
if (It > 1 && L->contains(InValI))
InVal = LastValueMap[InValI];
@@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
PN->eraseFromParent();
} else if (ULO.Count > 1) {
+ if (Reductions.contains(PN))
+ continue;
+
Value *InVal = PN->removeIncomingValue(LatchBlock, false);
// If this value was defined in the loop, take the value defined by the
// last iteration of the loop.
@@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
}
+ // If there are partial reductions, create code in the exit block to compute
+ // the final result and update users of the final result.
+ if (!PartialReductions.empty()) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock &&
+ "Can only introduce parallel reduction phis with single exit block");
+ assert(Reductions.size() == 1 &&
+ "currently only a single reduction is supported");
+ Value *FinalRdxValue = PartialReductions.back();
+ Value *RdxResult = nullptr;
+ for (PHINode &Phi : ExitBlock->phis()) {
+ if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
+ continue;
+ if (!RdxResult) {
+ RdxResult = PartialReductions.front();
+ IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
+ RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
+ for (Instruction *RdxPart : drop_begin(PartialReductions)) {
+ RdxResult = Builder.CreateBinOp(
+ (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
+ RdxPart, RdxResult, "bin.rdx");
+ }
+ NeedToFixLCSSA = true;
+ for (Instruction *RdxPart : PartialReductions)
+ RdxPart->dropPoisonGeneratingFlags();
+ }
+
+ Phi.replaceAllUsesWith(RdxResult);
+ continue;
+ }
+ }
+
if (DTUToUse) {
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
@@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
}
return nullptr;
}
+
+std::optional<RecurrenceDescriptor>
+llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
+ ScalarEvolution *SE) {
+ RecurrenceDescriptor RdxDesc;
+ if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc,
+ /*DemandedBits=*/nullptr,
+ /*AC=*/nullptr, /*DT=*/nullptr, SE))
+ return std::nullopt;
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ // Skip unsupported reductions.
+ // TODO: Handle additional reductions, including FP and min-max
+ // reductions.
+ if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
+ return std::nullopt;
+
+ if (RdxDesc.IntermediateStore)
+ return std::nullopt;
+
+ // Don't unroll reductions with constant ops; those can be folded to a
+ // single induction update.
+ if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch()))
+ ->operands(),
+ IsaPred<Constant>))
+ return std::nullopt;
+
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch ||
+ !is_contained(
+ cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(),
+ &Phi))
+ return std::nullopt;
+
+ return RdxDesc;
+}