diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 310 |
1 files changed, 212 insertions, 98 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 065622efc7ec..048a3e691fe5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper { // constant + x cannot be -constant - x // instead, it should be x - -constant if (Pos == 1 || - (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub)) + ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or || + FromOpcode == Instruction::Xor) && + ToOpcode == Instruction::Sub)) return SmallVector<Value *>({LHS, RHS}); return SmallVector<Value *>({RHS, LHS}); } @@ -1188,6 +1190,10 @@ public: if (CIValue.isAllOnes()) InterchangeableMask = CanBeAll; break; + case Instruction::Xor: + if (CIValue.isZero()) + InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT; + break; default: if (CIValue.isZero()) InterchangeableMask = CanBeAll; @@ -2099,6 +2105,7 @@ public: UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + TreeEntryToStridedPtrInfoMap.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -2234,10 +2241,70 @@ public: /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; - bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, - ArrayRef<unsigned> Order, const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - const int64_t Diff, StridedPtrInfo &SPtrInfo) const; + bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, + Align Alignment, const int64_t Diff, + const size_t Sz) const; + + /// Return true if an array of scalar loads can be replaced with a strided + /// load (with constant stride). + /// + /// TODO: + /// It is possible that the load gets "widened". Suppose that originally each + /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is + /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + (w - 1) + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + (w - 1) + /// ... + /// + /// %b + (n - 1) * %s + 0 + /// %b + (n - 1) * %s + 1 + /// %b + (n - 1) * %s + 2 + /// ... + /// %b + (n - 1) * %s + (w - 1) + /// + /// In this case we will generate a strided load of type `<n x (k * w)>`. + /// + /// \param PointerOps list of pointer arguments of loads. + /// \param ElemTy original scalar type of loads. + /// \param Alignment alignment of the first load. + /// \param SortedIndices is the order of PointerOps as returned by + /// `sortPtrAccesses` + /// \param Diff Pointer difference between the lowest and the highes pointer + /// in `PointerOps` as returned by `getPointersDiff`. + /// \param Ptr0 first pointer in `PointersOps`. + /// \param PtrN last pointer in `PointersOps`. + /// \param SPtrInfo If the function return `true`, it also sets all the fields + /// of `SPtrInfo` necessary to generate the strided load later. + bool analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const; + + /// Return true if an array of scalar loads can be replaced with a strided + /// load (with run-time stride). + /// \param PointerOps list of pointer arguments of loads. + /// \param ScalarTy type of loads. + /// \param CommonAlignment common alignement of loads as computed by + /// `computeCommonAlignment<LoadInst>`. + /// \param SortedIndicies is a list of indicies computed by this function such + /// that the sequence `PointerOps[SortedIndices[0]], + /// PointerOps[SortedIndicies[1]], ..., PointerOps[SortedIndices[n]]` is + /// ordered by the coefficient of the stride. For example, if PointerOps is + /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be + /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be + /// `0, 1, 2, 3, ...` we return empty vector for `SortedIndicies`. + /// \param SPtrInfo If the function return `true`, it also sets all the fields + /// of `SPtrInfo` necessary to generate the strided load later. + bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy, + Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const; /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. @@ -5265,19 +5332,19 @@ private: // data. for (TreeEntry *TE : Entries) { // Check if the user is commutative. - // The commutatives are handled later, as their oeprands can be + // The commutatives are handled later, as their operands can be // reordered. // Same applies even for non-commutative cmps, because we can invert // their predicate potentially and, thus, reorder the operands. bool IsCommutativeUser = ::isCommutative(User) || ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); - EdgeInfo EI(TE, U.getOperandNo()); if (!IsCommutativeUser && !isa<CmpInst>(User)) { unsigned &OpCnt = OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); - if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) - return false; + EdgeInfo EI(TE, U.getOperandNo()); + if (!getScheduleCopyableData(EI, Op)) + continue; // Found copyable operand - continue. ++OpCnt; continue; @@ -5286,33 +5353,38 @@ private: .first->getSecond(); } } - // Check the commutative/cmp entries. - if (!PotentiallyReorderedEntriesCount.empty()) { - for (auto &P : PotentiallyReorderedEntriesCount) { - auto *It = find(P.first->Scalars, User); - assert(It != P.first->Scalars.end() && - "User is not in the tree entry"); - int Lane = std::distance(P.first->Scalars.begin(), It); - assert(Lane >= 0 && "Lane is not found"); - if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) - Lane = P.first->ReorderIndices[Lane]; - assert(Lane < static_cast<int>(P.first->Scalars.size()) && - "Couldn't find extract lane"); - SmallVector<unsigned> OpIndices; - for (unsigned OpIdx : - seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( - P.first->getMainOp()))) { - if (P.first->getOperand(OpIdx)[Lane] == Op && - getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) - --P.getSecond(); - } - } - return all_of(PotentiallyReorderedEntriesCount, + if (PotentiallyReorderedEntriesCount.empty()) + return all_of(OrderedEntriesCount, [&](const std::pair<const TreeEntry *, unsigned> &P) { - return P.second == NumOps - 1; + return P.second == NumOps; }); - } - return true; + // Check the commutative/cmp entries. + for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa<StoreInst>(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector<unsigned> OpIndices; + for (unsigned OpIdx : + seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps - 1; + }) && + all_of(OrderedEntriesCount, + [&](const std::pair<const TreeEntry *, unsigned> &P) { + return P.second == NumOps; + }); } SmallVector<ScheduleCopyableData *> @@ -6817,13 +6889,9 @@ isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, /// 4. Any pointer operand is an instruction with the users outside of the /// current graph (for masked gathers extra extractelement instructions /// might be required). -bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, - ArrayRef<unsigned> Order, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - const int64_t Diff, - StridedPtrInfo &SPtrInfo) const { - const size_t Sz = VL.size(); +bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy, + Align Alignment, const int64_t Diff, + const size_t Sz) const { if (Diff % (Sz - 1) != 0) return false; @@ -6835,7 +6903,6 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, }); const uint64_t AbsoluteDiff = std::abs(Diff); - Type *ScalarTy = VL.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, Sz); if (IsAnyPointerUsedOutGraph || (AbsoluteDiff > Sz && @@ -6846,40 +6913,60 @@ bool BoUpSLP::isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); if (Diff != Stride * static_cast<int64_t>(Sz - 1)) return false; - Align Alignment = - cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) - ->getAlign(); - if (!TTI.isLegalStridedLoadStore(VecTy, Alignment)) + if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet<int64_t, 4> Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); - // If the strides are not the same or repeated, we can't - // vectorize. - if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) - break; - } - if (Dists.size() == Sz) { - Type *StrideTy = DL.getIndexType(Ptr0->getType()); - SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); - SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); - return true; - } + return true; + } + return false; +} + +bool BoUpSLP::analyzeConstantStrideCandidate( + const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment, + const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff, + Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const size_t Sz = PointerOps.size(); + if (!isStridedLoad(PointerOps, ScalarTy, Alignment, Diff, Sz)) + return false; + + int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); + + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. + SmallSet<int64_t, 4> Dists; + for (Value *Ptr : PointerOps) { + int64_t Dist = 0; + if (Ptr == PtrN) + Dist = Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) { + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); + SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); + return true; + } + return false; +} + +bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, + Type *ScalarTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const { + const unsigned Sz = PointerOps.size(); + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, Sz); + if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; + if (const SCEV *Stride = + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, SortedIndices)) { + SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); + SPtrInfo.StrideSCEV = Stride; + return true; } return false; } @@ -6924,15 +7011,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); if (!IsSorted) { - if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (const SCEV *Stride = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); - Stride && TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) { - SPtrInfo.Ty = getWidenedType(ScalarTy, PointerOps.size()); - SPtrInfo.StrideSCEV = Stride; - return LoadsState::StridedVectorize; - } - } + if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order, + SPtrInfo)) + return LoadsState::StridedVectorize; if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) @@ -6964,7 +7045,11 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( cast<Instruction>(V), UserIgnoreList); })) return LoadsState::CompressVectorize; - if (isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo)) + Align Alignment = + cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]) + ->getAlign(); + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, Order, + *Diff, Ptr0, PtrN, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -8942,6 +9027,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { void BoUpSLP::buildTree(ArrayRef<Value *> Roots, const SmallDenseSet<Value *> &UserIgnoreLst) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; @@ -8950,6 +9037,8 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); if (!allSameType(Roots)) return; buildTreeRec(Roots, 0, EdgeInfo()); @@ -10621,7 +10710,10 @@ class InstructionsCompatibilityAnalysis { /// Checks if the opcode is supported as the main opcode for copyable /// elements. static bool isSupportedOpcode(const unsigned Opcode) { - return Opcode == Instruction::Add || Opcode == Instruction::LShr; + return Opcode == Instruction::Add || Opcode == Instruction::LShr || + Opcode == Instruction::Shl || Opcode == Instruction::SDiv || + Opcode == Instruction::UDiv || Opcode == Instruction::And || + Opcode == Instruction::Or || Opcode == Instruction::Xor; } /// Identifies the best candidate value, which represents main opcode @@ -10631,7 +10723,9 @@ class InstructionsCompatibilityAnalysis { void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. - auto IsSupportedInstruction = [&](Instruction *I) { + auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) { + if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I))) + return false; return I && isSupportedOpcode(I->getOpcode()) && (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; @@ -10639,10 +10733,13 @@ class InstructionsCompatibilityAnalysis { // will be unable to schedule anyway. SmallDenseSet<Value *, 8> Operands; SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates; + bool AnyUndef = false; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); - if (!I) + if (!I) { + AnyUndef |= isa<UndefValue>(V); continue; + } if (!DT.isReachableFromEntry(I->getParent())) continue; if (Candidates.empty()) { @@ -10677,7 +10774,7 @@ class InstructionsCompatibilityAnalysis { if (P.second.size() < BestOpcodeNum) continue; for (Instruction *I : P.second) { - if (IsSupportedInstruction(I) && !Operands.contains(I)) { + if (IsSupportedInstruction(I, AnyUndef) && !Operands.contains(I)) { MainOp = I; BestOpcodeNum = P.second.size(); break; @@ -10938,6 +11035,12 @@ public: switch (MainOpcode) { case Instruction::Add: case Instruction::LShr: + case Instruction::Shl: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -17582,7 +17685,9 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { } if (IsPHI || (!E->isGather() && E->State != TreeEntry::SplitVectorize && - E->doesNotNeedToSchedule()) || + (E->doesNotNeedToSchedule() || + (E->hasCopyableElements() && !E->isCopyableElement(LastInst) && + isUsedOutsideBlock(LastInst)))) || (GatheredLoadsEntriesFirst.has_value() && E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && E->getOpcode() == Instruction::Load)) { @@ -19410,7 +19515,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } assert(getNumElements(Cond->getType()) == TrueNumElements && "Cannot vectorize Instruction::Select"); - Value *V = Builder.CreateSelect(Cond, True, False); + Value *V = + Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -20030,7 +20136,9 @@ Value *BoUpSLP::vectorizeTree( // The is because source vector that supposed to feed this gather node was // inserted at the end of the block [after stab instruction]. So we need // to adjust insertion point again to the end of block. - if (isa<PHINode>(UserI)) { + if (isa<PHINode>(UserI) || + (TE->UserTreeIndex.UserTE->hasState() && + TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) { // Insert before all users. Instruction *InsertPt = PrevVec->getParent()->getTerminator(); for (User *U : PrevVec->users()) { @@ -22007,6 +22115,8 @@ bool BoUpSLP::collectValuesToDemote( return all_of(E.Scalars, [&](Value *V) { if (isa<PoisonValue>(V)) return true; + if (E.isCopyableElement(V)) + return true; auto *I = cast<Instruction>(V); KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); return AmtKnownBits.getMaxValue().ult(BitWidth); @@ -22063,8 +22173,10 @@ bool BoUpSLP::collectValuesToDemote( auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!"); return all_of(E.Scalars, [&](Value *V) { - auto *I = cast<Instruction>(V); APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth); + if (E.hasCopyableElements() && E.isCopyableElement(V)) + return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)); + auto *I = cast<Instruction>(V); return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) && MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)); }); @@ -23524,18 +23636,19 @@ class HorizontalReduction { switch (Kind) { case RecurKind::Or: { if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy)) - return Builder.CreateSelect( + return Builder.CreateSelectWithUnknownProfile( LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)), - RHS, Name); + RHS, DEBUG_TYPE, Name); unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); } case RecurKind::And: { if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy)) - return Builder.CreateSelect( + return Builder.CreateSelectWithUnknownProfile( LHS, RHS, - ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name); + ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), + DEBUG_TYPE, Name); unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS, Name); @@ -23556,7 +23669,8 @@ class HorizontalReduction { if (UseSelect) { CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind); Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE, + Name); } [[fallthrough]]; case RecurKind::FMax: |
