diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Transforms/Scalar | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Transforms/Scalar')
| -rw-r--r-- | llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 116 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 42 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/GVN.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/InferAlignment.cpp | 35 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/JumpThreading.cpp | 11 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 176 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 76 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 27 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 3 |
10 files changed, 420 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 1ddb8ae9518f..4acc3f2d8469 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -19,9 +19,11 @@ #include "llvm/Analysis/ConstraintSystem.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" @@ -170,10 +172,12 @@ struct State { DominatorTree &DT; LoopInfo &LI; ScalarEvolution &SE; + TargetLibraryInfo &TLI; SmallVector<FactOrCheck, 64> WorkList; - State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) - : DT(DT), LI(LI), SE(SE) {} + State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, + TargetLibraryInfo &TLI) + : DT(DT), LI(LI), SE(SE), TLI(TLI) {} /// Process block \p BB and add known facts to work-list. void addInfoFor(BasicBlock &BB); @@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) { } } +static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP, + uint64_t AccessSize, + CmpPredicate &Pred, Value *&A, + Value *&B, const DataLayout &DL, + const TargetLibraryInfo &TLI) { + auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL); + if (!Offset.NW.hasNoUnsignedWrap()) + return false; + + if (Offset.VariableOffsets.size() != 1) + return false; + + uint64_t BitWidth = Offset.ConstantOffset.getBitWidth(); + auto &[Index, Scale] = Offset.VariableOffsets.front(); + // Bail out on non-canonical GEPs. + if (Index->getType()->getScalarSizeInBits() != BitWidth) + return false; + + ObjectSizeOpts Opts; + // Workaround for gep inbounds, ptr null, idx. + Opts.NullIsUnknownSize = true; + // Be conservative since we are not clear on whether an out of bounds access + // to the padding is UB or not. + Opts.RoundToAlign = true; + std::optional<TypeSize> Size = + getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts); + if (!Size || Size->isScalable()) + return false; + + // Index * Scale + ConstOffset + AccessSize <= AllocSize + // With nuw flag, we know that the index addition doesn't have unsigned wrap. + // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid + // value for Index. + APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize, + /*isSigned=*/false, /*implicitTrunc=*/true) - + Offset.ConstantOffset) + .udiv(Scale); + Pred = ICmpInst::ICMP_ULE; + A = Index; + B = ConstantInt::get(Index->getType(), MaxIndex); + return true; +} + void State::addInfoFor(BasicBlock &BB) { addInfoForInductions(BB); + auto &DL = BB.getDataLayout(); - // True as long as long as the current instruction is guaranteed to execute. + // True as long as the current instruction is guaranteed to execute. bool GuaranteedToExecute = true; // Queue conditions and assumes. for (Instruction &I : BB) { @@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) { continue; } + auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) { + auto *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP) + return; + TypeSize AccessSize = DL.getTypeStoreSize(AccessType); + if (!AccessSize.isFixed()) + return; + if (GuaranteedToExecute) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(), + Pred, A, B, DL, TLI)) { + // The memory access is guaranteed to execute when BB is entered, + // hence the constraint holds on entry to BB. + WorkList.emplace_back(FactOrCheck::getConditionFact( + DT.getNode(I.getParent()), Pred, A, B)); + } + } else { + WorkList.emplace_back( + FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I)); + } + }; + + if (auto *LI = dyn_cast<LoadInst>(&I)) { + if (!LI->isVolatile()) + AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType()); + } + if (auto *SI = dyn_cast<StoreInst>(&I)) { + if (!SI->isVolatile()) + AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType()); + } + auto *II = dyn_cast<IntrinsicInst>(&I); Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic; switch (ID) { @@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A, LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n"); auto R = Info.getConstraintForSolving(Pred, A, B); - if (R.empty() || !R.isValid(Info)){ + if (R.empty() || !R.isValid(Info)) { LLVM_DEBUG(dbgs() << " failed to decompose condition\n"); return std::nullopt; } @@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info, static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE, - OptimizationRemarkEmitter &ORE) { + OptimizationRemarkEmitter &ORE, + TargetLibraryInfo &TLI) { bool Changed = false; DT.updateDFSNumbers(); SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args())); ConstraintInfo Info(F.getDataLayout(), FunctionArgs); - State S(DT, LI, SE); + State S(DT, LI, SE, TLI); std::unique_ptr<Module> ReproducerModule( DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr); @@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, } continue; } + + auto &DL = F.getDataLayout(); + auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) { + CmpPredicate Pred; + Value *A, *B; + if (getConstraintFromMemoryAccess( + *cast<GetElementPtrInst>(Ptr), + DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL, + TLI)) + AddFact(Pred, A, B); + }; + + if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) { + AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType()); + continue; + } + if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) { + AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType()); + continue; + } } Value *A = nullptr, *B = nullptr; @@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F, auto &LI = AM.getResult<LoopAnalysis>(F); auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); - if (!eliminateConstraints(F, DT, LI, SE, ORE)) + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 434b55868c99..944b253e0f5e 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -521,7 +521,7 @@ private: Instruction *SIUse = dyn_cast<Instruction>(SI->user_back()); // The use of the select inst should be either a phi or another select. - if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) + if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse))) return false; BasicBlock *SIBB = SI->getParent(); @@ -581,15 +581,17 @@ struct AllSwitchPaths { VisitedBlocks VB; // Get paths from the determinator BBs to SwitchPhiDefBB std::vector<ThreadingPath> PathsToPhiDef = - getPathsFromStateDefMap(StateDef, SwitchPhi, VB); - if (SwitchPhiDefBB == SwitchBlock) { + getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths); + if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { TPaths = std::move(PathsToPhiDef); return; } + assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); + auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); // Find and append paths from SwitchPhiDefBB to SwitchBlock. PathsType PathsToSwitchBB = - paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1); + paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit); if (PathsToSwitchBB.empty()) return; @@ -610,13 +612,16 @@ private: typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, PHINode *Phi, - VisitedBlocks &VB) { + VisitedBlocks &VB, + unsigned PathsLimit) { std::vector<ThreadingPath> Res; auto *PhiBB = Phi->getParent(); VB.insert(PhiBB); VisitedBlocks UniqueBlocks; for (auto *IncomingBB : Phi->blocks()) { + if (Res.size() >= PathsLimit) + break; if (!UniqueBlocks.insert(IncomingBB).second) continue; if (!SwitchOuterLoop->contains(IncomingBB)) @@ -652,8 +657,9 @@ private: // Direct predecessor, just add to the path. if (IncomingPhiDefBB == IncomingBB) { - std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + assert(PathsLimit > Res.size()); + std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( + StateDef, IncomingPhi, VB, PathsLimit - Res.size()); for (ThreadingPath &Path : PredPaths) { Path.push_back(PhiBB); Res.push_back(std::move(Path)); @@ -666,13 +672,17 @@ private: continue; PathsType IntermediatePaths; - IntermediatePaths = - paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1); + assert(PathsLimit > Res.size()); + auto InterPathLimit = PathsLimit - Res.size(); + IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB, + /* PathDepth = */ 1, InterPathLimit); if (IntermediatePaths.empty()) continue; + assert(InterPathLimit >= IntermediatePaths.size()); + auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); std::vector<ThreadingPath> PredPaths = - getPathsFromStateDefMap(StateDef, IncomingPhi, VB); + getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit); for (const ThreadingPath &Path : PredPaths) { for (const PathType &IPath : IntermediatePaths) { ThreadingPath NewPath(Path); @@ -687,7 +697,7 @@ private: } PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, - unsigned PathDepth) { + unsigned PathDepth, unsigned PathsLimit) { PathsType Res; // Stop exploring paths after visiting MaxPathLength blocks @@ -714,6 +724,8 @@ private: // is used to prevent a duplicate path from being generated SmallPtrSet<BasicBlock *, 4> Successors; for (BasicBlock *Succ : successors(BB)) { + if (Res.size() >= PathsLimit) + break; if (!Successors.insert(Succ).second) continue; @@ -735,14 +747,12 @@ private: // coverage and compile time. if (LI->getLoopFor(Succ) != CurrLoop) continue; - - PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1); + assert(PathsLimit > Res.size()); + PathsType SuccPaths = + paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size()); for (PathType &Path : SuccPaths) { Path.push_front(BB); Res.push_back(Path); - if (Res.size() >= MaxNumPaths) { - return Res; - } } } // This block could now be visited again from a different predecessor. Note diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 4baa3b3eb824..26e17cc849bf 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, bool GVNPass::performScalarPRE(Instruction *CurInst) { if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() || isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() || - CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects()) + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + CurInst->getType()->isTokenLikeTy()) return false; // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index e9bf59c6850a..b60b15b6c3a2 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -35,8 +36,38 @@ static bool tryToImproveAlign( return true; } } - // TODO: Also handle memory intrinsics. - return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + + // TODO: Handle more memory intrinsics. + switch (II->getIntrinsicID()) { + case Intrinsic::masked_load: + case Intrinsic::masked_store: { + int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2; + Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getArgOperand(0) + : II->getArgOperand(1); + Type *Type = II->getIntrinsicID() == Intrinsic::masked_load + ? II->getType() + : II->getArgOperand(0)->getType(); + + Align OldAlign = + cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue(); + Align PrefAlign = DL.getPrefTypeAlign(Type); + Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); + if (NewAlign <= OldAlign) + return false; + + Value *V = + ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value()); + II->setOperand(AlignOpIdx, V); + return true; + } + default: + return false; + } } bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) { diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index c2a737d8f9a4..c7d71eb5633e 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // AvailablePreds vector as we go so that all of the PHI entries for this // predecessor use the same bitcast. Value *&PredV = I->second; - if (PredV->getType() != LoadI->getType()) + if (PredV->getType() != LoadI->getType()) { PredV = CastInst::CreateBitOrPointerCast( PredV, LoadI->getType(), "", P->getTerminator()->getIterator()); + // The new cast is producing the value used to replace the load + // instruction, so uses the load's debug location. If P does not always + // branch to the load BB however then the debug location must be dropped, + // as it is hoisted past a conditional branch. + DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1 + ? LoadI->getDebugLoc() + : DebugLoc::getDropped(); + cast<CastInst>(PredV)->setDebugLoc(DL); + } PN->addIncoming(PredV, I->first); } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 03b92d3338a9..0874b29ab7d2 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -39,6 +39,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/HashRecognize.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/MemoryLocation.h" @@ -143,6 +144,14 @@ static cl::opt<bool, true> cl::location(DisableLIRP::Wcslen), cl::init(false), cl::ReallyHidden); +bool DisableLIRP::HashRecognize; +static cl::opt<bool, true> + DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize", + cl::desc("Proceed with loop idiom recognize pass, " + "but do not optimize CRC loops."), + cl::location(DisableLIRP::HashRecognize), + cl::init(false), cl::ReallyHidden); + static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling " @@ -242,6 +251,7 @@ private: const SCEV *BECount); bool avoidLIRForMultiBlockLoop(bool IsMemset = false, bool IsLoopMemset = false); + bool optimizeCRCLoop(const PolynomialInfo &Info); /// @} /// \name Noncountable Loop Idiom Handling @@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); + std::optional<PolynomialInfo> HR; + LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, AR.MSSA, DL, ORE); if (!LIR.runOnLoop(&L)) @@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy) + if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || + HasMemcpy || !DisableLIRP::HashRecognize) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); } + + // Optimize a CRC loop if HashRecognize found one, provided we're not + // optimizing for size. + if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics) + if (auto Res = HashRecognize(*CurLoop, *SE).getResult()) + optimizeCRCLoop(*Res); + return MadeChange; } @@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, return false; } +bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { + // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using + // carry-less multiplication instructions, which is more efficient than our + // Sarwate table-lookup optimization. Hence, until we're able to emit + // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom, + // disable the optimization for Hexagon. + Module &M = *CurLoop->getHeader()->getModule(); + Triple TT(M.getTargetTriple()); + if (TT.getArch() == Triple::hexagon) + return false; + + // First, create a new GlobalVariable corresponding to the + // Sarwate-lookup-table. + Type *CRCTy = Info.LHS->getType(); + unsigned CRCBW = CRCTy->getIntegerBitWidth(); + std::array<Constant *, 256> CRCConstants; + transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped), + CRCConstants.begin(), + [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); }); + Constant *ConstArray = + ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants); + GlobalVariable *GV = + new GlobalVariable(M, ConstArray->getType(), true, + GlobalValue::PrivateLinkage, ConstArray, ".crctable"); + + PHINode *IV = CurLoop->getCanonicalInductionVariable(); + SmallVector<PHINode *, 2> Cleanup; + + // Next, mark all PHIs for removal except IV. + { + for (PHINode &PN : CurLoop->getHeader()->phis()) { + if (&PN == IV) + continue; + PN.replaceAllUsesWith(PoisonValue::get(PN.getType())); + Cleanup.push_back(&PN); + } + } + + // Next, fix up the trip count. + { + unsigned NewBTC = (Info.TripCount / 8) - 1; + BasicBlock *LoopBlk = CurLoop->getLoopLatch(); + BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator()); + CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk + ? ICmpInst::Predicate::ICMP_NE + : ICmpInst::Predicate::ICMP_EQ; + Instruction *ExitCond = CurLoop->getLatchCmpInst(); + Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC); + IRBuilder<> Builder(ExitCond); + Value *NewExitCond = + Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond"); + ExitCond->replaceAllUsesWith(NewExitCond); + deleteDeadInstruction(ExitCond); + } + + // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all + // uses of ComputedValue. + // + // Little-endian: + // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)] + // Big-Endian: + // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] + { + auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { + Type *OpTy = Op->getType(); + unsigned OpBW = OpTy->getIntegerBitWidth(); + return OpBW > 8 + ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) + : Op; + }; + auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, + const Twine &Name) { + Type *OpTy = Op->getType(); + + // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to + // use the CRC's bitwidth as the reference for shifting right. + return LoByte(Builder, + CRCBW > 8 ? Builder.CreateLShr( + Op, ConstantInt::get(OpTy, CRCBW - 8), Name) + : Op, + Name + ".lo.byte"); + }; + + IRBuilder<> Builder(CurLoop->getHeader(), + CurLoop->getHeader()->getFirstNonPHIIt()); + + // Create the CRC PHI, and initialize its incoming value to the initial + // value of CRC. + PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc"); + CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader()); + + // CRC is now an evolving variable, initialized to the PHI. + Value *CRC = CRCPhi; + + // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte + // of LHSAux), if LHSAux is non-nullptr. + Value *Indexer = CRC; + if (Value *Data = Info.LHSAux) { + Type *DataTy = Data->getType(); + + // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we + // shift right by that amount, and take the lo-byte (in the little-endian + // case), or shift left by that amount, and take the hi-idx (in the + // big-endian case). + Value *IVBits = Builder.CreateZExtOrTrunc( + Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer"); + Value *DataIndexer = + Info.ByteOrderSwapped + ? Builder.CreateShl(Data, IVBits, "data.indexer") + : Builder.CreateLShr(Data, IVBits, "data.indexer"); + Indexer = Builder.CreateXor( + DataIndexer, + Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"), + "crc.data.indexer"); + } + + Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi") + : LoByte(Builder, Indexer, "indexer.lo"); + + // Always index into a GEP using the index type. + Indexer = Builder.CreateZExt( + Indexer, SE->getDataLayout().getIndexType(GV->getType()), + "indexer.ext"); + + // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC]. + Value *CRCTableGEP = + Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd"); + Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld"); + + // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of + // CRC-8. + Value *CRCNext = CRCTableLd; + if (CRCBW > 8) { + Value *CRCShift = Info.ByteOrderSwapped + ? Builder.CreateShl(CRC, 8, "crc.be.shift") + : Builder.CreateLShr(CRC, 8, "crc.le.shift"); + CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next"); + } + + // Connect the back-edge for the loop, and RAUW the ComputedValue. + CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch()); + Info.ComputedValue->replaceUsesOutsideBlock(CRCNext, + CurLoop->getLoopLatch()); + } + + // Cleanup. + { + for (PHINode *PN : Cleanup) + RecursivelyDeleteDeadPHINode(PN); + SE->forgetLoop(CurLoop); + } + return true; +} + bool LoopIdiomRecognize::runOnNoncountableLoop() { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << CurLoop->getHeader()->getParent()->getName() diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f7d2258e1c28..2bda9d83236e 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze; UP.SCEVExpansionBudget = SCEVCheapExpansionBudget; UP.RuntimeUnrollMultiExit = false; + UP.AddAdditionalAccumulators = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, SE, UP, &ORE); @@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, ULO.Heart = getLoopConvergenceHeart(L); ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget; ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit; + ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators; LoopUnrollResult UnrollResult = UnrollLoop( L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 8b9d06d7e443..8a5569743ab4 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -247,8 +247,8 @@ private: /// index I' according to UserChain produced by function "find". /// /// The building conceptually takes two steps: - /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I + /// 1) iteratively distribute sext/zext/trunc towards the leaves of the + /// expression tree that computes I /// 2) reassociate the expression tree to the form I' + C. /// /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute @@ -260,29 +260,30 @@ private: Value *rebuildWithoutConstOffset(); /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => + /// offset, distribute sext/zext/trunc to the operands of all operators in + /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) => /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). /// /// The function also updates UserChain to point to new subexpressions after - /// distributing s/zext. e.g., the old UserChain of the above example is - /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), + /// distributing sext/zext/trunc. e.g., the old UserChain of the above example + /// is + /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), /// and the new UserChain is - /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) + /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> + /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) /// /// \p ChainIndex The index to UserChain. ChainIndex is initially /// UserChain.size() - 1, and is decremented during /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); + Value *distributeCastsAndCloneChain(unsigned ChainIndex); /// Reassociates the GEP index to the form I' + C and returns I'. Value *removeConstOffset(unsigned ChainIndex); - /// A helper function to apply ExtInsts, a list of s/zext, to value V. - /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function + /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value + /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function /// returns "sext i32 (zext i16 V to i32) to i64". - Value *applyExts(Value *V); + Value *applyCasts(Value *V); /// A helper function that returns whether we can trace into the operands /// of binary operator BO for a constant offset. @@ -307,8 +308,8 @@ private: SmallVector<User *, 8> UserChain; /// A data structure used in rebuildWithoutConstOffset. Contains all - /// sext/zext instructions along UserChain. - SmallVector<CastInst *, 16> ExtInsts; + /// sext/zext/trunc instructions along UserChain. + SmallVector<CastInst *, 16> CastInsts; /// Insertion position of cloned instructions. BasicBlock::iterator IP; @@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, } Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". + // Do not trace into "or" unless it is equivalent to "add nuw nsw". // This is the case if the or's disjoint flag is set. if (BO->getOpcode() == Instruction::Or && !cast<PossiblyDisjointInst>(BO)->isDisjoint()) @@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub) return false; - // In addition, tracing into BO requires that its surrounding s/zext (if - // any) is distributable to both operands. + // In addition, tracing into BO requires that its surrounding sext/zext/trunc + // (if any) is distributable to both operands. // // Suppose BO = A op B. // SignExtended | ZeroExtended | Distributable? @@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, return ConstantOffset; } -Value *ConstantOffsetExtractor::applyExts(Value *V) { +Value *ConstantOffsetExtractor::applyCasts(Value *V) { Value *Current = V; - // ExtInsts is built in the use-def order. Therefore, we apply them to V + // CastInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. - for (CastInst *I : llvm::reverse(ExtInsts)) { + for (CastInst *I : llvm::reverse(CastInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // Try to constant fold the cast. Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL); @@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { continue; } - Instruction *Ext = I->clone(); - Ext->setOperand(0, Current); + Instruction *Cast = I->clone(); + Cast->setOperand(0, Current); // In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so // we assume that it is ok to redistribute trunc over add/sub/or. But for // example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc // nuw (add A, B))). To make such redistributions legal we drop all the // poison generating flags from cloned trunc instructions here. - if (isa<TruncInst>(Ext)) - Ext->dropPoisonGeneratingFlags(); - Ext->insertBefore(*IP->getParent(), IP); - Current = Ext; + if (isa<TruncInst>(Cast)) + Cast->dropPoisonGeneratingFlags(); + Cast->insertBefore(*IP->getParent(), IP); + Current = Cast; } return Current; } Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); - // Remove all nullptrs (used to be s/zext) from UserChain. + distributeCastsAndCloneChain(UserChain.size() - 1); + // Remove all nullptrs (used to be sext/zext/trunc) from UserChain. unsigned NewSize = 0; for (User *I : UserChain) { if (I != nullptr) { @@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { } Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { +ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) { User *U = UserChain[ChainIndex]; if (ChainIndex == 0) { assert(isa<ConstantInt>(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U)); + // If U is a ConstantInt, applyCasts will return a ConstantInt as well. + return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U)); } if (CastInst *Cast = dyn_cast<CastInst>(U)) { assert( (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) && "Only following instructions can be traced: sext, zext & trunc"); - ExtInsts.push_back(Cast); + CastInsts.push_back(Cast); UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); + return distributeCastsAndCloneChain(ChainIndex - 1); } // Function find only trace into BinaryOperator and CastInst. BinaryOperator *BO = cast<BinaryOperator>(U); // OpNo = which operand of BO is UserChain[ChainIndex - 1] unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + Value *TheOther = applyCasts(BO->getOperand(1 - OpNo)); + Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1); BinaryOperator *NewBO = nullptr; if (OpNo == 0) { @@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]); assert((BO->use_empty() || BO->hasOneUse()) && - "distributeExtsAndCloneChain clones each BinaryOperator in " + "distributeCastsAndCloneChain clones each BinaryOperator in " "UserChain, so no one should be used more than " "once"); @@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) { // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" if (const TruncInst *TI = dyn_cast<TruncInst>(U)) return TI->hasNoUnsignedWrap(); - return isa<CastInst>(U) || isa<ConstantInt>(U); + assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User."); + return true; } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 9b40fc03da6b..e4ba70d1bce1 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier( static cl::opt<int> UnswitchSiblingsToplevelDiv( "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden, cl::desc("Toplevel siblings divisor for cost multiplier.")); +static cl::opt<int> UnswitchParentBlocksDiv( + "unswitch-parent-blocks-div", cl::init(8), cl::Hidden, + cl::desc("Outer loop size divisor for cost multiplier.")); static cl::opt<int> UnswitchNumInitialUnscaledCandidates( "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden, cl::desc("Number of unswitch candidates that are ignored when calculating " @@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch -/// candidates available. Also accounting for the number of "sibling" loops with -/// the idea to account for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch +/// candidates available. Also consider the number of "sibling" loops with +/// the idea of accounting for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier( return 1; } + // Each invariant non-trivial condition, after being unswitched, is supposed + // to have its own specialized sibling loop (the invariant condition has been + // hoisted out of the child loop into a newly-cloned loop). When unswitching + // conditions in nested loops, the basic block size of the outer loop should + // not be altered. If such a size significantly increases across unswitching + // invocations, something may be wrong; so adjust the final cost taking this + // into account. auto *ParentL = L.getParentLoop(); + int ParentLoopSizeMultiplier = 1; + if (ParentL) + ParentLoopSizeMultiplier = + std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1); + int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() : std::distance(LI.begin(), LI.end())); // Count amount of clones that all the candidates might cause during @@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier( // at an upper bound. int CostMultiplier; if (ClonesPower > Log2_32(UnswitchThreshold) || - SiblingsMultiplier > UnswitchThreshold) + SiblingsMultiplier > UnswitchThreshold || + ParentLoopSizeMultiplier > UnswitchThreshold) CostMultiplier = UnswitchThreshold; else CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower), (int)UnswitchThreshold); LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier - << " (siblings " << SiblingsMultiplier << " * clones " + << " (siblings " << SiblingsMultiplier << " * parent size " + << ParentLoopSizeMultiplier << " * clones " << (1 << ClonesPower) << ")" << " for unswitch candidate: " << TI << "\n"); return CostMultiplier; diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index bb7dbc2980f5..e05625344ee2 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() { continue; OtherPhi->setIncomingValue(PoisonValBBIdx, V); - Phi->setIncomingValue(i, OtherV); + if (DT->dominates(OtherV, Phi)) + Phi->setIncomingValue(i, OtherV); } } } |
