diff options
| author | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 18:49:54 +0900 |
|---|---|---|
| committer | NAKAMURA Takumi <geek4civic@gmail.com> | 2025-01-09 18:49:54 +0900 |
| commit | e2810c9a248f4c7fbfae84bb32b6f7e01027458b (patch) | |
| tree | ae0b02a8491b969a1cee94ea16ffe42c559143c5 /llvm/lib/Transforms/Vectorize/VectorCombine.cpp | |
| parent | fa04eb4af95c1ca7377279728cb004bcd2324d01 (diff) | |
| parent | bdcf47e4bcb92889665825654bb80a8bbe30379e (diff) | |
Merge branch 'users/chapuni/cov/single/base' into users/chapuni/cov/single/switchusers/chapuni/cov/single/switch
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 97 |
1 files changed, 75 insertions, 22 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index ecbc13d489eb..1a669b5058e7 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -128,6 +128,8 @@ private: bool shrinkType(Instruction &I); void replaceValue(Value &Old, Value &New) { + LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); + LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); if (auto *NewI = dyn_cast<Instruction>(&New)) { New.takeName(&Old); @@ -139,10 +141,17 @@ private: void eraseInstruction(Instruction &I) { LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n'); - for (Value *Op : I.operands()) - Worklist.pushValue(Op); + SmallVector<Value *> Ops(I.operands()); Worklist.remove(&I); I.eraseFromParent(); + + // Push remaining users of the operands and then the operand itself - allows + // further folds that were hindered by OneUse limits. + for (Value *Op : Ops) + if (auto *OpI = dyn_cast<Instruction>(Op)) { + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } }; } // namespace @@ -696,7 +705,8 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, CostKind); + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, + CostKind); bool NeedLenChg = SrcVecTy->getNumElements() != NumElts; // If the lengths of the two vectors are not equal, @@ -1335,6 +1345,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) { MemoryLocation::get(SI), AA)) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(Load); + if (ScalarizableIdx.isSafeWithFreeze()) ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx)); Value *GEP = Builder.CreateInBoundsGEP( @@ -1360,8 +1374,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!match(&I, m_Load(m_Value(Ptr)))) return false; - auto *VecTy = cast<VectorType>(I.getType()); auto *LI = cast<LoadInst>(&I); + auto *VecTy = cast<VectorType>(LI->getType()); if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType())) return false; @@ -1401,7 +1415,8 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LastCheckedInst = UI; } - auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT); + auto ScalarIdx = + canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT); if (ScalarIdx.isUnsafe()) return false; if (ScalarIdx.isSafeWithFreeze()) { @@ -1409,7 +1424,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { ScalarIdx.discard(); } - auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1)); + auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand()); OriginalCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, Index ? Index->getZExtValue() : -1); @@ -1422,10 +1437,14 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (ScalarizedCost >= OriginalCost) return false; + // Ensure we add the load back to the worklist BEFORE its users so they can + // erased in the correct order. + Worklist.push(LI); + // Replace extracts with narrow scalar loads. for (User *U : LI->users()) { auto *EI = cast<ExtractElementInst>(U); - Value *Idx = EI->getOperand(1); + Value *Idx = EI->getIndexOperand(); // Insert 'freeze' for poison indexes. auto It = NeedFreeze.find(EI); @@ -1669,7 +1688,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { Value *X, *Y, *Z, *W; bool IsCommutative = false; - CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE; + CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE; if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) && match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) { auto *BO = cast<BinaryOperator>(LHS); @@ -1677,8 +1697,9 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem()) return false; IsCommutative = BinaryOperator::isCommutative(BO->getOpcode()); - } else if (match(LHS, m_Cmp(Pred, m_Value(X), m_Value(Y))) && - match(RHS, m_SpecificCmp(Pred, m_Value(Z), m_Value(W)))) { + } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) && + match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) && + (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) { IsCommutative = cast<CmpInst>(LHS)->isCommutative(); } else return false; @@ -1723,18 +1744,48 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I); + // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns + // where one use shuffles have gotten split across the binop/cmp. These + // often allow a major reduction in total cost that wouldn't happen as + // individual folds. + auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask, + TTI::TargetCostKind CostKind) -> bool { + Value *InnerOp; + ArrayRef<int> InnerMask; + if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(), + m_Mask(InnerMask)))) && + InnerOp->getType() == Op->getType() && + all_of(InnerMask, + [NumSrcElts](int M) { return M < (int)NumSrcElts; })) { + for (int &M : Mask) + if (Offset <= M && M < (int)(Offset + NumSrcElts)) { + M = InnerMask[M - Offset]; + M = 0 <= M ? M + Offset : M; + } + OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind); + Op = InnerOp; + return true; + } + return false; + }; + bool ReducedInstCount = false; + ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind); + ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind); + ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind); + ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind); + InstructionCost NewCost = TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) + TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W}); - if (Pred == CmpInst::BAD_ICMP_PREDICATE) { + if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) { NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind); } else { auto *ShuffleCmpTy = FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy); NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, - ShuffleDstTy, Pred, CostKind); + ShuffleDstTy, PredLHS, CostKind); } LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I @@ -1743,17 +1794,17 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { // If either shuffle will constant fold away, then fold for the same cost as // we will reduce the instruction count. - bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) || - (isa<Constant>(Y) && isa<Constant>(W)); + ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) || + (isa<Constant>(Y) && isa<Constant>(W)); if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost)) return false; Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0); Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1); - Value *NewBO = Pred == CmpInst::BAD_ICMP_PREDICATE + Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE ? Builder.CreateBinOp( cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1) - : Builder.CreateCmp(Pred, Shuf0, Shuf1); + : Builder.CreateCmp(PredLHS, Shuf0, Shuf1); // Intersect flags from the old binops. if (auto *NewInst = dyn_cast<Instruction>(NewBO)) { @@ -1972,9 +2023,7 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) { if (Match1) InnerCost1 = TTI.getInstructionCost(cast<Instruction>(OuterV1), CostKind); - InstructionCost OuterCost = TTI.getShuffleCost( - TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy, OuterMask, CostKind, - 0, nullptr, {OuterV0, OuterV1}, &I); + InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind); InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost; @@ -3047,12 +3096,16 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) { TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx); InstructionCost OldCost = ExtCost + InsCost; - InstructionCost NewCost = TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, - nullptr, {DstVec, SrcVec}); + // Ignore 'free' identity insertion shuffle. + // TODO: getShuffleCost should return TCC_Free for Identity shuffles. + InstructionCost NewCost = 0; + if (!ShuffleVectorInst::isIdentityMask(Mask, NumElts)) + NewCost += TTI.getShuffleCost(SK, VecTy, Mask, CostKind, 0, nullptr, + {DstVec, SrcVec}); if (!Ext->hasOneUse()) NewCost += ExtCost; - LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair : " << I + LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); |
