summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Scalar
diff options
context:
space:
mode:
authorMingming Liu <mingmingl@google.com>2025-09-10 15:25:31 -0700
committerGitHub <noreply@github.com>2025-09-10 15:25:31 -0700
commit1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Transforms/Scalar
parent898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parentb8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Transforms/Scalar')
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp116
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp42
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp3
-rw-r--r--llvm/lib/Transforms/Scalar/InferAlignment.cpp35
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp176
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp76
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp27
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp3
10 files changed, 420 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 1ddb8ae9518f..4acc3f2d8469 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -19,9 +19,11 @@
#include "llvm/Analysis/ConstraintSystem.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -170,10 +172,12 @@ struct State {
DominatorTree &DT;
LoopInfo &LI;
ScalarEvolution &SE;
+ TargetLibraryInfo &TLI;
SmallVector<FactOrCheck, 64> WorkList;
- State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE)
- : DT(DT), LI(LI), SE(SE) {}
+ State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE,
+ TargetLibraryInfo &TLI)
+ : DT(DT), LI(LI), SE(SE), TLI(TLI) {}
/// Process block \p BB and add known facts to work-list.
void addInfoFor(BasicBlock &BB);
@@ -1109,10 +1113,54 @@ void State::addInfoForInductions(BasicBlock &BB) {
}
}
+static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP,
+ uint64_t AccessSize,
+ CmpPredicate &Pred, Value *&A,
+ Value *&B, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL);
+ if (!Offset.NW.hasNoUnsignedWrap())
+ return false;
+
+ if (Offset.VariableOffsets.size() != 1)
+ return false;
+
+ uint64_t BitWidth = Offset.ConstantOffset.getBitWidth();
+ auto &[Index, Scale] = Offset.VariableOffsets.front();
+ // Bail out on non-canonical GEPs.
+ if (Index->getType()->getScalarSizeInBits() != BitWidth)
+ return false;
+
+ ObjectSizeOpts Opts;
+ // Workaround for gep inbounds, ptr null, idx.
+ Opts.NullIsUnknownSize = true;
+ // Be conservative since we are not clear on whether an out of bounds access
+ // to the padding is UB or not.
+ Opts.RoundToAlign = true;
+ std::optional<TypeSize> Size =
+ getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts);
+ if (!Size || Size->isScalable())
+ return false;
+
+ // Index * Scale + ConstOffset + AccessSize <= AllocSize
+ // With nuw flag, we know that the index addition doesn't have unsigned wrap.
+ // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid
+ // value for Index.
+ APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize,
+ /*isSigned=*/false, /*implicitTrunc=*/true) -
+ Offset.ConstantOffset)
+ .udiv(Scale);
+ Pred = ICmpInst::ICMP_ULE;
+ A = Index;
+ B = ConstantInt::get(Index->getType(), MaxIndex);
+ return true;
+}
+
void State::addInfoFor(BasicBlock &BB) {
addInfoForInductions(BB);
+ auto &DL = BB.getDataLayout();
- // True as long as long as the current instruction is guaranteed to execute.
+ // True as long as the current instruction is guaranteed to execute.
bool GuaranteedToExecute = true;
// Queue conditions and assumes.
for (Instruction &I : BB) {
@@ -1127,6 +1175,38 @@ void State::addInfoFor(BasicBlock &BB) {
continue;
}
+ auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return;
+ TypeSize AccessSize = DL.getTypeStoreSize(AccessType);
+ if (!AccessSize.isFixed())
+ return;
+ if (GuaranteedToExecute) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(),
+ Pred, A, B, DL, TLI)) {
+ // The memory access is guaranteed to execute when BB is entered,
+ // hence the constraint holds on entry to BB.
+ WorkList.emplace_back(FactOrCheck::getConditionFact(
+ DT.getNode(I.getParent()), Pred, A, B));
+ }
+ } else {
+ WorkList.emplace_back(
+ FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
+ }
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isVolatile())
+ AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType());
+ }
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isVolatile())
+ AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType());
+ }
+
auto *II = dyn_cast<IntrinsicInst>(&I);
Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
switch (ID) {
@@ -1420,7 +1500,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A,
LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n");
auto R = Info.getConstraintForSolving(Pred, A, B);
- if (R.empty() || !R.isValid(Info)){
+ if (R.empty() || !R.isValid(Info)) {
LLVM_DEBUG(dbgs() << " failed to decompose condition\n");
return std::nullopt;
}
@@ -1785,12 +1865,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
ScalarEvolution &SE,
- OptimizationRemarkEmitter &ORE) {
+ OptimizationRemarkEmitter &ORE,
+ TargetLibraryInfo &TLI) {
bool Changed = false;
DT.updateDFSNumbers();
SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args()));
ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
- State S(DT, LI, SE);
+ State S(DT, LI, SE, TLI);
std::unique_ptr<Module> ReproducerModule(
DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
@@ -1960,6 +2041,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
}
continue;
}
+
+ auto &DL = F.getDataLayout();
+ auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(
+ *cast<GetElementPtrInst>(Ptr),
+ DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL,
+ TLI))
+ AddFact(Pred, A, B);
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) {
+ AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType());
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) {
+ AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType());
+ continue;
+ }
}
Value *A = nullptr, *B = nullptr;
@@ -2018,7 +2119,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F,
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- if (!eliminateConstraints(F, DT, LI, SE, ORE))
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 434b55868c99..944b253e0f5e 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -521,7 +521,7 @@ private:
Instruction *SIUse = dyn_cast<Instruction>(SI->user_back());
// The use of the select inst should be either a phi or another select.
- if (!SIUse && !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
+ if (!SIUse || !(isa<PHINode>(SIUse) || isa<SelectInst>(SIUse)))
return false;
BasicBlock *SIBB = SI->getParent();
@@ -581,15 +581,17 @@ struct AllSwitchPaths {
VisitedBlocks VB;
// Get paths from the determinator BBs to SwitchPhiDefBB
std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB);
- if (SwitchPhiDefBB == SwitchBlock) {
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+ if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
TPaths = std::move(PathsToPhiDef);
return;
}
+ assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
+ auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
// Find and append paths from SwitchPhiDefBB to SwitchBlock.
PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1);
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
if (PathsToSwitchBB.empty())
return;
@@ -610,13 +612,16 @@ private:
typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap;
std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef,
PHINode *Phi,
- VisitedBlocks &VB) {
+ VisitedBlocks &VB,
+ unsigned PathsLimit) {
std::vector<ThreadingPath> Res;
auto *PhiBB = Phi->getParent();
VB.insert(PhiBB);
VisitedBlocks UniqueBlocks;
for (auto *IncomingBB : Phi->blocks()) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!UniqueBlocks.insert(IncomingBB).second)
continue;
if (!SwitchOuterLoop->contains(IncomingBB))
@@ -652,8 +657,9 @@ private:
// Direct predecessor, just add to the path.
if (IncomingPhiDefBB == IncomingBB) {
- std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ assert(PathsLimit > Res.size());
+ std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap(
+ StateDef, IncomingPhi, VB, PathsLimit - Res.size());
for (ThreadingPath &Path : PredPaths) {
Path.push_back(PhiBB);
Res.push_back(std::move(Path));
@@ -666,13 +672,17 @@ private:
continue;
PathsType IntermediatePaths;
- IntermediatePaths =
- paths(IncomingPhiDefBB, IncomingBB, VB, /* PathDepth = */ 1);
+ assert(PathsLimit > Res.size());
+ auto InterPathLimit = PathsLimit - Res.size();
+ IntermediatePaths = paths(IncomingPhiDefBB, IncomingBB, VB,
+ /* PathDepth = */ 1, InterPathLimit);
if (IntermediatePaths.empty())
continue;
+ assert(InterPathLimit >= IntermediatePaths.size());
+ auto PredPathLimit = InterPathLimit / IntermediatePaths.size();
std::vector<ThreadingPath> PredPaths =
- getPathsFromStateDefMap(StateDef, IncomingPhi, VB);
+ getPathsFromStateDefMap(StateDef, IncomingPhi, VB, PredPathLimit);
for (const ThreadingPath &Path : PredPaths) {
for (const PathType &IPath : IntermediatePaths) {
ThreadingPath NewPath(Path);
@@ -687,7 +697,7 @@ private:
}
PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited,
- unsigned PathDepth) {
+ unsigned PathDepth, unsigned PathsLimit) {
PathsType Res;
// Stop exploring paths after visiting MaxPathLength blocks
@@ -714,6 +724,8 @@ private:
// is used to prevent a duplicate path from being generated
SmallPtrSet<BasicBlock *, 4> Successors;
for (BasicBlock *Succ : successors(BB)) {
+ if (Res.size() >= PathsLimit)
+ break;
if (!Successors.insert(Succ).second)
continue;
@@ -735,14 +747,12 @@ private:
// coverage and compile time.
if (LI->getLoopFor(Succ) != CurrLoop)
continue;
-
- PathsType SuccPaths = paths(Succ, ToBB, Visited, PathDepth + 1);
+ assert(PathsLimit > Res.size());
+ PathsType SuccPaths =
+ paths(Succ, ToBB, Visited, PathDepth + 1, PathsLimit - Res.size());
for (PathType &Path : SuccPaths) {
Path.push_front(BB);
Res.push_back(Path);
- if (Res.size() >= MaxNumPaths) {
- return Res;
- }
}
}
// This block could now be visited again from a different predecessor. Note
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 4baa3b3eb824..26e17cc849bf 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2982,7 +2982,8 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
bool GVNPass::performScalarPRE(Instruction *CurInst) {
if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects())
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ CurInst->getType()->isTokenLikeTy())
return false;
// Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index e9bf59c6850a..b60b15b6c3a2 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -35,8 +36,38 @@ static bool tryToImproveAlign(
return true;
}
}
- // TODO: Also handle memory intrinsics.
- return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ // TODO: Handle more memory intrinsics.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_load:
+ case Intrinsic::masked_store: {
+ int AlignOpIdx = II->getIntrinsicID() == Intrinsic::masked_load ? 1 : 2;
+ Value *PtrOp = II->getIntrinsicID() == Intrinsic::masked_load
+ ? II->getArgOperand(0)
+ : II->getArgOperand(1);
+ Type *Type = II->getIntrinsicID() == Intrinsic::masked_load
+ ? II->getType()
+ : II->getArgOperand(0)->getType();
+
+ Align OldAlign =
+ cast<ConstantInt>(II->getArgOperand(AlignOpIdx))->getAlignValue();
+ Align PrefAlign = DL.getPrefTypeAlign(Type);
+ Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
+ if (NewAlign <= OldAlign)
+ return false;
+
+ Value *V =
+ ConstantInt::get(Type::getInt32Ty(II->getContext()), NewAlign.value());
+ II->setOperand(AlignOpIdx, V);
+ return true;
+ }
+ default:
+ return false;
+ }
}
bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index c2a737d8f9a4..c7d71eb5633e 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1437,9 +1437,18 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// AvailablePreds vector as we go so that all of the PHI entries for this
// predecessor use the same bitcast.
Value *&PredV = I->second;
- if (PredV->getType() != LoadI->getType())
+ if (PredV->getType() != LoadI->getType()) {
PredV = CastInst::CreateBitOrPointerCast(
PredV, LoadI->getType(), "", P->getTerminator()->getIterator());
+ // The new cast is producing the value used to replace the load
+ // instruction, so uses the load's debug location. If P does not always
+ // branch to the load BB however then the debug location must be dropped,
+ // as it is hoisted past a conditional branch.
+ DebugLoc DL = P->getTerminator()->getNumSuccessors() == 1
+ ? LoadI->getDebugLoc()
+ : DebugLoc::getDropped();
+ cast<CastInst>(PredV)->setDebugLoc(DL);
+ }
PN->addIncoming(PredV, I->first);
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 03b92d3338a9..0874b29ab7d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -39,6 +39,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/HashRecognize.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -143,6 +144,14 @@ static cl::opt<bool, true>
cl::location(DisableLIRP::Wcslen), cl::init(false),
cl::ReallyHidden);
+bool DisableLIRP::HashRecognize;
+static cl::opt<bool, true>
+ DisableLIRPHashRecognize("disable-" DEBUG_TYPE "-hashrecognize",
+ cl::desc("Proceed with loop idiom recognize pass, "
+ "but do not optimize CRC loops."),
+ cl::location(DisableLIRP::HashRecognize),
+ cl::init(false), cl::ReallyHidden);
+
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
cl::desc("Use loop idiom recognition code size heuristics when compiling "
@@ -242,6 +251,7 @@ private:
const SCEV *BECount);
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
bool IsLoopMemset = false);
+ bool optimizeCRCLoop(const PolynomialInfo &Info);
/// @}
/// \name Noncountable Loop Idiom Handling
@@ -287,6 +297,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+ std::optional<PolynomialInfo> HR;
+
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
AR.MSSA, DL, ORE);
if (!LIR.runOnLoop(&L))
@@ -335,7 +347,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
- if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
+ if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic ||
+ HasMemcpy || !DisableLIRP::HashRecognize)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
@@ -378,6 +391,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
}
+
+ // Optimize a CRC loop if HashRecognize found one, provided we're not
+ // optimizing for size.
+ if (!DisableLIRP::HashRecognize && !ApplyCodeSizeHeuristics)
+ if (auto Res = HashRecognize(*CurLoop, *SE).getResult())
+ optimizeCRCLoop(*Res);
+
return MadeChange;
}
@@ -1514,6 +1534,160 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
return false;
}
+bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
+ // FIXME: Hexagon has a special HexagonLoopIdiom that optimizes CRC using
+ // carry-less multiplication instructions, which is more efficient than our
+ // Sarwate table-lookup optimization. Hence, until we're able to emit
+ // target-specific instructions for Hexagon, subsuming HexagonLoopIdiom,
+ // disable the optimization for Hexagon.
+ Module &M = *CurLoop->getHeader()->getModule();
+ Triple TT(M.getTargetTriple());
+ if (TT.getArch() == Triple::hexagon)
+ return false;
+
+ // First, create a new GlobalVariable corresponding to the
+ // Sarwate-lookup-table.
+ Type *CRCTy = Info.LHS->getType();
+ unsigned CRCBW = CRCTy->getIntegerBitWidth();
+ std::array<Constant *, 256> CRCConstants;
+ transform(HashRecognize::genSarwateTable(Info.RHS, Info.ByteOrderSwapped),
+ CRCConstants.begin(),
+ [CRCTy](const APInt &E) { return ConstantInt::get(CRCTy, E); });
+ Constant *ConstArray =
+ ConstantArray::get(ArrayType::get(CRCTy, 256), CRCConstants);
+ GlobalVariable *GV =
+ new GlobalVariable(M, ConstArray->getType(), true,
+ GlobalValue::PrivateLinkage, ConstArray, ".crctable");
+
+ PHINode *IV = CurLoop->getCanonicalInductionVariable();
+ SmallVector<PHINode *, 2> Cleanup;
+
+ // Next, mark all PHIs for removal except IV.
+ {
+ for (PHINode &PN : CurLoop->getHeader()->phis()) {
+ if (&PN == IV)
+ continue;
+ PN.replaceAllUsesWith(PoisonValue::get(PN.getType()));
+ Cleanup.push_back(&PN);
+ }
+ }
+
+ // Next, fix up the trip count.
+ {
+ unsigned NewBTC = (Info.TripCount / 8) - 1;
+ BasicBlock *LoopBlk = CurLoop->getLoopLatch();
+ BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
+ CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
+ ? ICmpInst::Predicate::ICMP_NE
+ : ICmpInst::Predicate::ICMP_EQ;
+ Instruction *ExitCond = CurLoop->getLatchCmpInst();
+ Value *ExitLimit = ConstantInt::get(IV->getType(), NewBTC);
+ IRBuilder<> Builder(ExitCond);
+ Value *NewExitCond =
+ Builder.CreateICmp(ExitPred, IV, ExitLimit, "exit.cond");
+ ExitCond->replaceAllUsesWith(NewExitCond);
+ deleteDeadInstruction(ExitCond);
+ }
+
+ // Finally, fill the loop with the Sarwate-table-lookup logic, and replace all
+ // uses of ComputedValue.
+ //
+ // Little-endian:
+ // crc = (crc >> 8) ^ tbl[(iv'th byte of data) ^ (bottom byte of crc)]
+ // Big-Endian:
+ // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)]
+ {
+ auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) {
+ Type *OpTy = Op->getType();
+ unsigned OpBW = OpTy->getIntegerBitWidth();
+ return OpBW > 8
+ ? Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name)
+ : Op;
+ };
+ auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op,
+ const Twine &Name) {
+ Type *OpTy = Op->getType();
+
+ // When the bitwidth of the CRC mismatches the Op's bitwidth, we need to
+ // use the CRC's bitwidth as the reference for shifting right.
+ return LoByte(Builder,
+ CRCBW > 8 ? Builder.CreateLShr(
+ Op, ConstantInt::get(OpTy, CRCBW - 8), Name)
+ : Op,
+ Name + ".lo.byte");
+ };
+
+ IRBuilder<> Builder(CurLoop->getHeader(),
+ CurLoop->getHeader()->getFirstNonPHIIt());
+
+ // Create the CRC PHI, and initialize its incoming value to the initial
+ // value of CRC.
+ PHINode *CRCPhi = Builder.CreatePHI(CRCTy, 2, "crc");
+ CRCPhi->addIncoming(Info.LHS, CurLoop->getLoopPreheader());
+
+ // CRC is now an evolving variable, initialized to the PHI.
+ Value *CRC = CRCPhi;
+
+ // TableIndexer = ((top|bottom) byte of CRC). It is XOR'ed with (iv'th byte
+ // of LHSAux), if LHSAux is non-nullptr.
+ Value *Indexer = CRC;
+ if (Value *Data = Info.LHSAux) {
+ Type *DataTy = Data->getType();
+
+ // To index into the (iv'th byte of LHSAux), we multiply iv by 8, and we
+ // shift right by that amount, and take the lo-byte (in the little-endian
+ // case), or shift left by that amount, and take the hi-idx (in the
+ // big-endian case).
+ Value *IVBits = Builder.CreateZExtOrTrunc(
+ Builder.CreateShl(IV, 3, "iv.bits"), DataTy, "iv.indexer");
+ Value *DataIndexer =
+ Info.ByteOrderSwapped
+ ? Builder.CreateShl(Data, IVBits, "data.indexer")
+ : Builder.CreateLShr(Data, IVBits, "data.indexer");
+ Indexer = Builder.CreateXor(
+ DataIndexer,
+ Builder.CreateZExtOrTrunc(Indexer, DataTy, "crc.indexer.cast"),
+ "crc.data.indexer");
+ }
+
+ Indexer = Info.ByteOrderSwapped ? HiIdx(Builder, Indexer, "indexer.hi")
+ : LoByte(Builder, Indexer, "indexer.lo");
+
+ // Always index into a GEP using the index type.
+ Indexer = Builder.CreateZExt(
+ Indexer, SE->getDataLayout().getIndexType(GV->getType()),
+ "indexer.ext");
+
+ // CRCTableLd = CRCTable[(iv'th byte of data) ^ (top|bottom) byte of CRC].
+ Value *CRCTableGEP =
+ Builder.CreateInBoundsGEP(CRCTy, GV, Indexer, "tbl.ptradd");
+ Value *CRCTableLd = Builder.CreateLoad(CRCTy, CRCTableGEP, "tbl.ld");
+
+ // CRCNext = (CRC (<<|>>) 8) ^ CRCTableLd, or simply CRCTableLd in case of
+ // CRC-8.
+ Value *CRCNext = CRCTableLd;
+ if (CRCBW > 8) {
+ Value *CRCShift = Info.ByteOrderSwapped
+ ? Builder.CreateShl(CRC, 8, "crc.be.shift")
+ : Builder.CreateLShr(CRC, 8, "crc.le.shift");
+ CRCNext = Builder.CreateXor(CRCShift, CRCTableLd, "crc.next");
+ }
+
+ // Connect the back-edge for the loop, and RAUW the ComputedValue.
+ CRCPhi->addIncoming(CRCNext, CurLoop->getLoopLatch());
+ Info.ComputedValue->replaceUsesOutsideBlock(CRCNext,
+ CurLoop->getLoopLatch());
+ }
+
+ // Cleanup.
+ {
+ for (PHINode *PN : Cleanup)
+ RecursivelyDeleteDeadPHINode(PN);
+ SE->forgetLoop(CurLoop);
+ }
+ return true;
+}
+
bool LoopIdiomRecognize::runOnNoncountableLoop() {
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName()
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f7d2258e1c28..2bda9d83236e 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
UP.RuntimeUnrollMultiExit = false;
+ UP.AddAdditionalAccumulators = false;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ULO.Heart = getLoopConvergenceHeart(L);
ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
+ ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
LoopUnrollResult UnrollResult = UnrollLoop(
L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 8b9d06d7e443..8a5569743ab4 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -247,8 +247,8 @@ private:
/// index I' according to UserChain produced by function "find".
///
/// The building conceptually takes two steps:
- /// 1) iteratively distribute s/zext towards the leaves of the expression tree
- /// that computes I
+ /// 1) iteratively distribute sext/zext/trunc towards the leaves of the
+ /// expression tree that computes I
/// 2) reassociate the expression tree to the form I' + C.
///
/// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
@@ -260,29 +260,30 @@ private:
Value *rebuildWithoutConstOffset();
/// After the first step of rebuilding the GEP index without the constant
- /// offset, distribute s/zext to the operands of all operators in UserChain.
- /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
+ /// offset, distribute sext/zext/trunc to the operands of all operators in
+ /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
/// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
///
/// The function also updates UserChain to point to new subexpressions after
- /// distributing s/zext. e.g., the old UserChain of the above example is
- /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// distributing sext/zext/trunc. e.g., the old UserChain of the above example
+ /// is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
/// and the new UserChain is
- /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
///
/// \p ChainIndex The index to UserChain. ChainIndex is initially
/// UserChain.size() - 1, and is decremented during
/// the recursion.
- Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+ Value *distributeCastsAndCloneChain(unsigned ChainIndex);
/// Reassociates the GEP index to the form I' + C and returns I'.
Value *removeConstOffset(unsigned ChainIndex);
- /// A helper function to apply ExtInsts, a list of s/zext, to value V.
- /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value
+ /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function
/// returns "sext i32 (zext i16 V to i32) to i64".
- Value *applyExts(Value *V);
+ Value *applyCasts(Value *V);
/// A helper function that returns whether we can trace into the operands
/// of binary operator BO for a constant offset.
@@ -307,8 +308,8 @@ private:
SmallVector<User *, 8> UserChain;
/// A data structure used in rebuildWithoutConstOffset. Contains all
- /// sext/zext instructions along UserChain.
- SmallVector<CastInst *, 16> ExtInsts;
+ /// sext/zext/trunc instructions along UserChain.
+ SmallVector<CastInst *, 16> CastInsts;
/// Insertion position of cloned instructions.
BasicBlock::iterator IP;
@@ -491,7 +492,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
}
Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
- // Do not trace into "or" unless it is equivalent to "add".
+ // Do not trace into "or" unless it is equivalent to "add nuw nsw".
// This is the case if the or's disjoint flag is set.
if (BO->getOpcode() == Instruction::Or &&
!cast<PossiblyDisjointInst>(BO)->isDisjoint())
@@ -503,8 +504,8 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
return false;
- // In addition, tracing into BO requires that its surrounding s/zext (if
- // any) is distributable to both operands.
+ // In addition, tracing into BO requires that its surrounding sext/zext/trunc
+ // (if any) is distributable to both operands.
//
// Suppose BO = A op B.
// SignExtended | ZeroExtended | Distributable?
@@ -628,11 +629,11 @@ APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
return ConstantOffset;
}
-Value *ConstantOffsetExtractor::applyExts(Value *V) {
+Value *ConstantOffsetExtractor::applyCasts(Value *V) {
Value *Current = V;
- // ExtInsts is built in the use-def order. Therefore, we apply them to V
+ // CastInsts is built in the use-def order. Therefore, we apply them to V
// in the reversed order.
- for (CastInst *I : llvm::reverse(ExtInsts)) {
+ for (CastInst *I : llvm::reverse(CastInsts)) {
if (Constant *C = dyn_cast<Constant>(Current)) {
// Try to constant fold the cast.
Current = ConstantFoldCastOperand(I->getOpcode(), C, I->getType(), DL);
@@ -640,24 +641,24 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
continue;
}
- Instruction *Ext = I->clone();
- Ext->setOperand(0, Current);
+ Instruction *Cast = I->clone();
+ Cast->setOperand(0, Current);
// In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
// we assume that it is ok to redistribute trunc over add/sub/or. But for
// example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
// nuw (add A, B))). To make such redistributions legal we drop all the
// poison generating flags from cloned trunc instructions here.
- if (isa<TruncInst>(Ext))
- Ext->dropPoisonGeneratingFlags();
- Ext->insertBefore(*IP->getParent(), IP);
- Current = Ext;
+ if (isa<TruncInst>(Cast))
+ Cast->dropPoisonGeneratingFlags();
+ Cast->insertBefore(*IP->getParent(), IP);
+ Current = Cast;
}
return Current;
}
Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
- distributeExtsAndCloneChain(UserChain.size() - 1);
- // Remove all nullptrs (used to be s/zext) from UserChain.
+ distributeCastsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (used to be sext/zext/trunc) from UserChain.
unsigned NewSize = 0;
for (User *I : UserChain) {
if (I != nullptr) {
@@ -670,29 +671,29 @@ Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
}
Value *
-ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) {
User *U = UserChain[ChainIndex];
if (ChainIndex == 0) {
assert(isa<ConstantInt>(U));
- // If U is a ConstantInt, applyExts will return a ConstantInt as well.
- return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ // If U is a ConstantInt, applyCasts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyCasts(U));
}
if (CastInst *Cast = dyn_cast<CastInst>(U)) {
assert(
(isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
"Only following instructions can be traced: sext, zext & trunc");
- ExtInsts.push_back(Cast);
+ CastInsts.push_back(Cast);
UserChain[ChainIndex] = nullptr;
- return distributeExtsAndCloneChain(ChainIndex - 1);
+ return distributeCastsAndCloneChain(ChainIndex - 1);
}
// Function find only trace into BinaryOperator and CastInst.
BinaryOperator *BO = cast<BinaryOperator>(U);
// OpNo = which operand of BO is UserChain[ChainIndex - 1]
unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
- Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+ Value *TheOther = applyCasts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeCastsAndCloneChain(ChainIndex - 1);
BinaryOperator *NewBO = nullptr;
if (OpNo == 0) {
@@ -713,7 +714,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
assert((BO->use_empty() || BO->hasOneUse()) &&
- "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "distributeCastsAndCloneChain clones each BinaryOperator in "
"UserChain, so no one should be used more than "
"once");
@@ -847,7 +848,8 @@ static bool allowsPreservingNUW(const User *U) {
// "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)"
if (const TruncInst *TI = dyn_cast<TruncInst>(U))
return TI->hasNoUnsignedWrap();
- return isa<CastInst>(U) || isa<ConstantInt>(U);
+ assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User.");
+ return true;
}
Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc03da6b..e4ba70d1bce1 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
static cl::opt<int> UnswitchSiblingsToplevelDiv(
"unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchParentBlocksDiv(
+ "unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
+ cl::desc("Outer loop size divisor for cost multiplier."));
static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
"unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
return 1;
}
+ // Each invariant non-trivial condition, after being unswitched, is supposed
+ // to have its own specialized sibling loop (the invariant condition has been
+ // hoisted out of the child loop into a newly-cloned loop). When unswitching
+ // conditions in nested loops, the basic block size of the outer loop should
+ // not be altered. If such a size significantly increases across unswitching
+ // invocations, something may be wrong; so adjust the final cost taking this
+ // into account.
auto *ParentL = L.getParentLoop();
+ int ParentLoopSizeMultiplier = 1;
+ if (ParentL)
+ ParentLoopSizeMultiplier =
+ std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
+
int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
: std::distance(LI.begin(), LI.end()));
// Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
// at an upper bound.
int CostMultiplier;
if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
+ SiblingsMultiplier > UnswitchThreshold ||
+ ParentLoopSizeMultiplier > UnswitchThreshold)
CostMultiplier = UnswitchThreshold;
else
CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
(int)UnswitchThreshold);
LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
+ << " (siblings " << SiblingsMultiplier << " * parent size "
+ << ParentLoopSizeMultiplier << " * clones "
<< (1 << ClonesPower) << ")"
<< " for unswitch candidate: " << TI << "\n");
return CostMultiplier;
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index bb7dbc2980f5..e05625344ee2 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -997,7 +997,8 @@ void StructurizeCFG::simplifyHoistedPhis() {
continue;
OtherPhi->setIncomingValue(PoisonValBBIdx, V);
- Phi->setIncomingValue(i, OtherV);
+ if (DT->dominates(OtherV, Phi))
+ Phi->setIncomingValue(i, OtherV);
}
}
}