diff options
Diffstat (limited to 'llvm/lib/Transforms/Utils/LoopUnroll.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopUnroll.cpp | 138 |
1 files changed, 138 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 86b268de43cf..b18aceaa67d7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden, #endif ); +static cl::opt<bool> UnrollAddParallelReductions( + "unroll-add-parallel-reductions", cl::init(false), cl::Hidden, + cl::desc("Allow unrolling to add parallel reduction phis.")); /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. @@ -660,6 +664,41 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, OrigPHINode.push_back(cast<PHINode>(I)); } + // Collect phi nodes for reductions for which we can introduce multiple + // parallel reduction phis and compute the final reduction result after the + // loop. This requires a single exit block after unrolling. This is ensured by + // restricting to single-block loops where the unrolled iterations are known + // to not exit. + DenseMap<PHINode *, RecurrenceDescriptor> Reductions; + bool CanAddAdditionalAccumulators = + (UnrollAddParallelReductions.getNumOccurrences() > 0 + ? UnrollAddParallelReductions + : ULO.AddAdditionalAccumulators) && + !CompletelyUnroll && L->getNumBlocks() == 1 && + (ULO.Runtime || + (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 && + ExitInfos[Header].BreakoutTrip == 0)))); + + // Limit parallelizing reductions to unroll counts of 4 or less for now. + // TODO: The number of parallel reductions should depend on the number of + // execution units. We also don't have to add a parallel reduction phi per + // unrolled iteration, but could for example add a parallel phi for every 2 + // unrolled iterations. + if (CanAddAdditionalAccumulators && ULO.Count <= 4) { + for (PHINode &Phi : Header->phis()) { + auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE); + if (!RdxDesc) + continue; + + // Only handle duplicate phis for a single reduction for now. + // TODO: Handle any number of reductions + if (!Reductions.empty()) + continue; + + Reductions[&Phi] = *RdxDesc; + } + } + std::vector<BasicBlock *> Headers; std::vector<BasicBlock *> Latches; Headers.push_back(Header); @@ -710,6 +749,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + SmallVector<Instruction *> PartialReductions; for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector<BasicBlock *, 8> NewBlocks; SmallDenseMap<const Loop *, Loop *, 4> NewLoops; @@ -733,6 +773,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + + // Use cloned phis as parallel phis for partial reductions, which will + // get combined to the final reduction result after the loop. + if (Reductions.contains(OrigPHI)) { + // Collect partial reduction results. + if (PartialReductions.empty()) + PartialReductions.push_back(cast<Instruction>(InVal)); + PartialReductions.push_back(cast<Instruction>(VMap[InVal])); + + // Update the start value for the cloned phis to use the identity + // value for the reduction. + const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI]; + NewPHI->setIncomingValueForBlock( + L->getLoopPreheader(), + getRecurrenceIdentity(RdxDesc.getRecurrenceKind(), + OrigPHI->getType(), + RdxDesc.getFastMathFlags())); + + // Update NewPHI to use the cloned value for the iteration and move + // to header. + NewPHI->replaceUsesOfWith(InVal, VMap[InVal]); + NewPHI->moveBefore(OrigPHI->getIterator()); + continue; + } + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; @@ -832,6 +897,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); PN->eraseFromParent(); } else if (ULO.Count > 1) { + if (Reductions.contains(PN)) + continue; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); // If this value was defined in the loop, take the value defined by the // last iteration of the loop. @@ -1010,6 +1078,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // If there are partial reductions, create code in the exit block to compute + // the final result and update users of the final result. + if (!PartialReductions.empty()) { + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && + "Can only introduce parallel reduction phis with single exit block"); + assert(Reductions.size() == 1 && + "currently only a single reduction is supported"); + Value *FinalRdxValue = PartialReductions.back(); + Value *RdxResult = nullptr; + for (PHINode &Phi : ExitBlock->phis()) { + if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue) + continue; + if (!RdxResult) { + RdxResult = PartialReductions.front(); + IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt()); + RecurKind RK = Reductions.begin()->second.getRecurrenceKind(); + for (Instruction *RdxPart : drop_begin(PartialReductions)) { + RdxResult = Builder.CreateBinOp( + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), + RdxPart, RdxResult, "bin.rdx"); + } + NeedToFixLCSSA = true; + for (Instruction *RdxPart : PartialReductions) + RdxPart->dropPoisonGeneratingFlags(); + } + + Phi.replaceAllUsesWith(RdxResult); + continue; + } + } + if (DTUToUse) { // Apply updates to the DomTree. DT = &DTU.getDomTree(); @@ -1111,3 +1211,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { } return nullptr; } + +std::optional<RecurrenceDescriptor> +llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE) { + RecurrenceDescriptor RdxDesc; + if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc, + /*DemandedBits=*/nullptr, + /*AC=*/nullptr, /*DT=*/nullptr, SE)) + return std::nullopt; + RecurKind RK = RdxDesc.getRecurrenceKind(); + // Skip unsupported reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. + if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) || + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + return std::nullopt; + + if (RdxDesc.IntermediateStore) + return std::nullopt; + + // Don't unroll reductions with constant ops; those can be folded to a + // single induction update. + if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch())) + ->operands(), + IsaPred<Constant>)) + return std::nullopt; + + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || + !is_contained( + cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(), + &Phi)) + return std::nullopt; + + return RdxDesc; +} |
