diff options
Diffstat (limited to 'llvm/lib/Transforms')
| -rw-r--r-- | llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 9 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 49 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp | 47 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 205 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.cpp | 16 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 68 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 16 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 56 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 147 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 7 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 43 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 71 |
14 files changed, 620 insertions, 127 deletions
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 3986359b6a5a..4df18c824927 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -583,10 +583,8 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { // RemoveDIs: there's no bitcode representation of the DbgVariableRecord // debug-info, convert to dbg.values before writing out. - bool ConvertToOldDbgFormatForWrite = - M.IsNewDbgInfoFormat && !WriteNewDbgInfoFormatToBitcode; - if (ConvertToOldDbgFormatForWrite) - M.convertFromNewDbgValues(); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat && + WriteNewDbgInfoFormatToBitcode); bool Changed = writeThinLTOBitcode( OS, ThinLinkOS, @@ -595,8 +593,5 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { }, M, &AM.getResult<ModuleSummaryIndexAnalysis>(M)); - if (ConvertToOldDbgFormatForWrite) - M.convertToNewDbgValues(); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index d0d349c891a3..ad1cd9c1f6bf 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -182,18 +182,11 @@ static cl::opt<bool> ClWithTls( "platforms that support this"), cl::Hidden, cl::init(true)); -static cl::opt<bool> - CSelectiveInstrumentation("hwasan-selective-instrumentation", - cl::desc("Use selective instrumentation"), - cl::Hidden, cl::init(false)); - -static cl::opt<int> ClHotPercentileCutoff( - "hwasan-percentile-cutoff-hot", cl::init(0), - cl::desc("Alternative hot percentile cuttoff." 
- "By default `-profile-summary-cutoff-hot` is used.")); +static cl::opt<int> ClHotPercentileCutoff("hwasan-percentile-cutoff-hot", + cl::desc("Hot percentile cuttoff.")); static cl::opt<float> - ClRandomSkipRate("hwasan-random-skip-rate", cl::init(0), + ClRandomSkipRate("hwasan-random-skip-rate", cl::desc("Probability value in the range [0.0, 1.0] " "to skip instrumentation of a function.")); @@ -317,7 +310,7 @@ private: }; bool selectiveInstrumentationShouldSkip(Function &F, - FunctionAnalysisManager &FAM); + FunctionAnalysisManager &FAM) const; void initializeModule(); void createHwasanCtorComdat(); @@ -1500,28 +1493,22 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, } bool HWAddressSanitizer::selectiveInstrumentationShouldSkip( - Function &F, FunctionAnalysisManager &FAM) { + Function &F, FunctionAnalysisManager &FAM) const { if (ClRandomSkipRate.getNumOccurrences()) { std::bernoulli_distribution D(ClRandomSkipRate); - if (D(*Rng)) - return true; - } else { - auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F); - ProfileSummaryInfo *PSI = - MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); - if (PSI && PSI->hasProfileSummary()) { - auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); - if ((ClHotPercentileCutoff.getNumOccurrences() && - ClHotPercentileCutoff >= 0) - ? 
PSI->isFunctionHotInCallGraphNthPercentile( - ClHotPercentileCutoff, &F, BFI) - : PSI->isFunctionHotInCallGraph(&F, BFI)) - return true; - } else { - ++NumNoProfileSummaryFuncs; - } + return (D(*Rng)); } - return false; + if (!ClHotPercentileCutoff.getNumOccurrences()) + return false; + auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F); + ProfileSummaryInfo *PSI = + MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); + if (!PSI || !PSI->hasProfileSummary()) { + ++NumNoProfileSummaryFuncs; + return false; + } + return PSI->isFunctionHotInCallGraphNthPercentile( + ClHotPercentileCutoff, &F, FAM.getResult<BlockFrequencyAnalysis>(F)); } void HWAddressSanitizer::sanitizeFunction(Function &F, @@ -1537,7 +1524,7 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, NumTotalFuncs++; - if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM)) + if (selectiveInstrumentationShouldSkip(F, FAM)) return; NumInstrumentedFuncs++; diff --git a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp index d87f7482a21d..6adc29f8572b 100644 --- a/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp +++ b/llvm/lib/Transforms/Instrumentation/RemoveTrapsPass.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" @@ -22,13 +23,11 @@ using namespace llvm; #define DEBUG_TYPE "remove-traps" -static cl::opt<int> HotPercentileCutoff( - "remove-traps-percentile-cutoff-hot", cl::init(0), - cl::desc("Alternative hot percentile cuttoff. 
By default " - "`-profile-summary-cutoff-hot` is used.")); +static cl::opt<int> HotPercentileCutoff("remove-traps-percentile-cutoff-hot", + cl::desc("Hot percentile cuttoff.")); static cl::opt<float> - RandomRate("remove-traps-random-rate", cl::init(0.0), + RandomRate("remove-traps-random-rate", cl::desc("Probability value in the range [0.0, 1.0] of " "unconditional pseudo-random checks removal.")); @@ -37,9 +36,11 @@ STATISTIC(NumChecksRemoved, "Number of removed checks"); static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI, const ProfileSummaryInfo *PSI) { - SmallVector<IntrinsicInst *, 16> Remove; + SmallVector<std::pair<IntrinsicInst *, bool>, 16> ReplaceWithValue; std::unique_ptr<RandomNumberGenerator> Rng; + // TODO: + // https://github.com/llvm/llvm-project/pull/84858#discussion_r1520603139 auto ShouldRemove = [&](bool IsHot) { if (!RandomRate.getNumOccurrences()) return IsHot; @@ -56,26 +57,23 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI, continue; auto ID = II->getIntrinsicID(); switch (ID) { - case Intrinsic::ubsantrap: { + case Intrinsic::allow_ubsan_check: + case Intrinsic::allow_runtime_check: { ++NumChecksTotal; bool IsHot = false; if (PSI) { - uint64_t Count = 0; - for (const auto *PR : predecessors(&BB)) - Count += BFI.getBlockProfileCount(PR).value_or(0); - - IsHot = - HotPercentileCutoff.getNumOccurrences() - ? 
(HotPercentileCutoff > 0 && - PSI->isHotCountNthPercentile(HotPercentileCutoff, Count)) - : PSI->isHotCount(Count); + uint64_t Count = BFI.getBlockProfileCount(&BB).value_or(0); + IsHot = PSI->isHotCountNthPercentile(HotPercentileCutoff, Count); } - if (ShouldRemove(IsHot)) { - Remove.push_back(II); + bool ToRemove = ShouldRemove(IsHot); + ReplaceWithValue.push_back({ + II, + ToRemove, + }); + if (ToRemove) ++NumChecksRemoved; - } break; } default: @@ -84,10 +82,12 @@ static bool removeUbsanTraps(Function &F, const BlockFrequencyInfo &BFI, } } - for (IntrinsicInst *I : Remove) + for (auto [I, V] : ReplaceWithValue) { + I->replaceAllUsesWith(ConstantInt::getBool(I->getType(), !V)); I->eraseFromParent(); + } - return !Remove.empty(); + return !ReplaceWithValue.empty(); } PreservedAnalyses RemoveTrapsPass::run(Function &F, @@ -102,3 +102,8 @@ PreservedAnalyses RemoveTrapsPass::run(Function &F, return removeUbsanTraps(F, BFI, PSI) ? PreservedAnalyses::none() : PreservedAnalyses::all(); } + +bool RemoveTrapsPass::IsRequested() { + return RandomRate.getNumOccurrences() || + HotPercentileCutoff.getNumOccurrences(); +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0834865173b2..cb0fd06554e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -124,6 +124,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/IR/VectorBuilder.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), - clEnumValN( - TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, - "data-and-control-without-rt-check", - 
"Similar to data-and-control, but remove the runtime check"))); + clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, + "data-and-control-without-rt-check", + "Similar to data-and-control, but remove the runtime check"), + clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", + "Use predicated EVL instructions for tail folding. If EVL " + "is unsupported, fallback to data-without-lane-mask."))); static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -1505,29 +1508,62 @@ public: /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { - return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first - : ChosenTailFoldingStyle.second; + if (!ChosenTailFoldingStyle) + return TailFoldingStyle::None; + return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first + : ChosenTailFoldingStyle->second; } /// Selects and saves TailFoldingStyle for 2 options - if IV update may /// overflow or not. - void setTailFoldingStyles() { - assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None && - ChosenTailFoldingStyle.second == TailFoldingStyle::None && - "Tail folding must not be selected yet."); - if (!Legal->prepareToFoldTailByMasking()) + /// \param IsScalableVF true if scalable vector factors enabled. + /// \param UserIC User specific interleave count. 
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { + assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); + if (!Legal->prepareToFoldTailByMasking()) { + ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); return; + } - if (ForceTailFoldingStyle.getNumOccurrences()) { - ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second = - ForceTailFoldingStyle; + if (!ForceTailFoldingStyle.getNumOccurrences()) { + ChosenTailFoldingStyle = std::make_pair( + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); return; } - ChosenTailFoldingStyle.first = - TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true); - ChosenTailFoldingStyle.second = - TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false); + // Set styles when forced. + ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), + ForceTailFoldingStyle.getValue()); + if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) + return; + // Override forced styles if needed. + // FIXME: use actual opcode/data type for analysis here. + // FIXME: Investigate opportunity for fixed vector factor. + bool EVLIsLegal = + IsScalableVF && UserIC <= 1 && + TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: implement support for max safe dependency distance. + Legal->isSafeForAnyVectorWidth() && + // FIXME: remove this once reductions are supported. + Legal->getReductionVars().empty(); + if (!EVLIsLegal) { + // If for some reason EVL mode is unsupported, fallback to + // DataWithoutLaneMask to try to vectorize the loop with folded tail + // in a generic way. + ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::DataWithoutLaneMask, + TailFoldingStyle::DataWithoutLaneMask); + LLVM_DEBUG( + dbgs() + << "LV: Preference for VP intrinsics indicated. 
Will " + "not try to generate VP Intrinsics " + << (UserIC > 1 + ? "since interleave count specified is greater than 1.\n" + : "due to non-interleaving reasons.\n")); + } } /// Returns true if all loop blocks should be masked to fold tail loop. @@ -1544,6 +1580,18 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Returns true if VP intrinsics with explicit vector length support should + /// be generated in the tail folded loop. + bool foldTailWithEVL() const { + return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && + // FIXME: remove this once vp_reverse is supported. + none_of( + WideningDecisions, + [](const std::pair<std::pair<Instruction *, ElementCount>, + std::pair<InstWidening, InstructionCost>> + &Data) { return Data.second.first == CM_Widen_Reverse; }); + } + /// Returns true if the Phi is part of an inloop reduction. bool isInLoopReduction(PHINode *Phi) const { return InLoopReductions.contains(Phi); @@ -1688,8 +1736,8 @@ private: /// Control finally chosen tail folding style. The first element is used if /// the IV update may overflow, the second element - if it does not. - std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle = - std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); + std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> + ChosenTailFoldingStyle; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. 
- setTailFoldingStyles(); - if (foldTailByMasking()) + setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); + if (foldTailByMasking()) { + if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { + LLVM_DEBUG( + dbgs() + << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " + "try to generate VP Intrinsics with scalable vector " + "factors only.\n"); + // Tail folded loop using VP intrinsics restricts the VF to be scalable + // for now. + // TODO: extend it for fixed vectors, if required. + assert(MaxFactors.ScalableVF.isScalable() && + "Expected scalable vector factor."); + + MaxFactors.FixedVF = ElementCount::getFixed(1); + } return MaxFactors; + } // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. @@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if EVL is preferred and no User IC is specified. + if (foldTailWithEVL()) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " + "Unroll factor forced to be 1.\n"); + return 1; + } + // We used the distance for the interleave count. if (!Legal->isSafeForAnyVectorWidth()) return 1; @@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanTransforms::truncateToMinimalBitwidths( *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); VPlanTransforms::optimize(*Plan, *PSE.getSE()); + // TODO: try to put it close to addActiveLaneMask(). 
+ if (CM.foldTailWithEVL()) + VPlanTransforms::addExplicitVectorLength(*Plan); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } @@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); Value *Step = State.get(getStepValue(), VPIteration(0, 0)); - Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); + Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0)); Value *DerivedIV = emitTransformedIndex( State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, Kind, cast_if_present<BinaryOperator>(FPBinOp)); @@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) { State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); } +/// Creates either vp_store or vp_scatter intrinsics calls to represent +/// predicated store/scatter. +static Instruction * +lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, + Value *StoredVal, bool IsScatter, Value *Mask, + Value *EVL, const Align &Alignment) { + CallInst *Call; + if (IsScatter) { + Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), + Intrinsic::vp_scatter, + {StoredVal, Addr, Mask, EVL}); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + Call = cast<CallInst>(VBuilder.createVectorInstruction( + Instruction::Store, Type::getVoidTy(EVL->getContext()), + {StoredVal, Addr})); + } + Call->addParamAttr( + 1, Attribute::getWithAlignment(Call->getContext(), Alignment)); + return Call; +} + +/// Creates either vp_load or vp_gather intrinsics calls to represent +/// predicated load/gather. 
+static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, + VectorType *DataTy, + Value *Addr, bool IsGather, + Value *Mask, Value *EVL, + const Align &Alignment) { + CallInst *Call; + if (IsGather) { + Call = + Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, + nullptr, "wide.masked.gather"); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + Call = cast<CallInst>(VBuilder.createVectorInstruction( + Instruction::Load, DataTy, Addr, "vp.op.load")); + } + Call->addParamAttr( + 0, Attribute::getWithAlignment(Call->getContext(), Alignment)); + return Call; +} + void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; @@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); - if (CreateGatherScatter) { + // TODO: split this into several classes for better design. + if (State.EVL) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + assert(cast<VPInstruction>(State.EVL)->getOpcode() == + VPInstruction::ExplicitVectorLength && + "EVL must be VPInstruction::ExplicitVectorLength."); + Value *EVL = State.get(State.EVL, VPIteration(0, 0)); + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL + // is created only if TTI prefers predicated vectorization, thus if EVL + // is not nullptr it also implies preference for predicated + // vectorization. + // FIXME: Support reverse store after vp_reverse is added. + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + NewSI = lowerStoreUsingVectorIntrinsics( + Builder, State.get(getAddr(), Part, !CreateGatherScatter), + StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment); + } else if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, @@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.setDebugLocFrom(getDebugLoc()); for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; - if (CreateGatherScatter) { + // TODO: split this into several classes for better design. + if (State.EVL) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + assert(cast<VPInstruction>(State.EVL)->getOpcode() == + VPInstruction::ExplicitVectorLength && + "EVL must be VPInstruction::ExplicitVectorLength."); + Value *EVL = State.get(State.EVL, VPIteration(0, 0)); + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL + // is created only if TTI prefers predicated vectorization, thus if EVL + // is not nullptr it also implies preference for predicated + // vectorization. + // FIXME: Support reverse loading after vp_reverse is added. + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + NewLI = lowerLoadUsingVectorIntrinsics( + Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter), + CreateGatherScatter, MaskPart, EVL, Alignment); + } else if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 99769540f780..bdd26acfd2f8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1973,7 +1973,7 @@ public: assert(isa<Instruction>(VL[0]) && "Expected instruction"); unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands(); constexpr unsigned IntrinsicNumOperands = 2; - if (auto *CI = dyn_cast<IntrinsicInst>(VL[0])) + if (isa<IntrinsicInst>(VL[0])) NumOperands = IntrinsicNumOperands; OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); @@ -14141,6 +14141,16 @@ bool BoUpSLP::collectValuesToDemote( })) return FinalAnalysis(); + if (!all_of(I->users(), + [=](User *U) { + return getTreeEntry(U) || + (UserIgnoreList && UserIgnoreList->contains(U)) || + (U->getType()->isSized() && + DL->getTypeSizeInBits(U->getType()) <= BitWidth); + }) && + !IsPotentiallyTruncated(I, BitWidth)) + return false; + unsigned Start = 0; unsigned End = I->getNumOperands(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index f0b7008992d7..8ebd75da3465 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -871,13 +871,15 @@ void VPlan::execute(VPTransformState *State) { // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are // generated. 
- bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) || - isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) || - (isa<VPReductionPHIRecipe>(PhiR) && - cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); - bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) || - (isa<VPReductionPHIRecipe>(PhiR) && - cast<VPReductionPHIRecipe>(PhiR)->isInLoop()); + bool SinglePartNeeded = + isa<VPCanonicalIVPHIRecipe>(PhiR) || + isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) || + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isOrdered()); + bool NeedsScalar = + isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) || + (isa<VPReductionPHIRecipe>(PhiR) && + cast<VPReductionPHIRecipe>(PhiR)->isInLoop()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 813ebda29ffd..77577b516ae2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -242,6 +242,15 @@ struct VPTransformState { ElementCount VF; unsigned UF; + /// If EVL (Explicit Vector Length) is not nullptr, then EVL must be a valid + /// value set during plan transformation, possibly a default value = whole + /// vector register length. EVL is created only if TTI prefers predicated + /// vectorization, thus if EVL is not nullptr it also implies preference for + /// predicated vectorization. + /// TODO: this is a temporarily solution, the EVL must be explicitly used by + /// the recipes and must be removed here. + VPValue *EVL = nullptr; + /// Hold the indices to generate specific scalar instructions. Null indicates /// that all instances are to be generated, using either scalar or vector /// instructions. 
@@ -1159,6 +1168,7 @@ public: SLPLoad, SLPStore, ActiveLaneMask, + ExplicitVectorLength, CalculateTripCountMinusVF, // Increment the canonical IV separately for each unrolled part. CanonicalIVIncrementForPart, @@ -2489,6 +2499,45 @@ public: #endif }; +/// A recipe for generating the phi node for the current index of elements, +/// adjusted in accordance with EVL value. It starts at the start value of the +/// canonical induction and gets incremented by EVL in each iteration of the +/// vector loop. +class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe { +public: + VPEVLBasedIVPHIRecipe(VPValue *StartIV, DebugLoc DL) + : VPHeaderPHIRecipe(VPDef::VPEVLBasedIVPHISC, nullptr, StartIV, DL) {} + + ~VPEVLBasedIVPHIRecipe() override = default; + + VPEVLBasedIVPHIRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPEVLBasedIVPHISC) + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC; + } + + /// Generate phi for handling IV based on EVL over iterations correctly. + /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe. + void execute(VPTransformState &State) override; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. 
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe { public: @@ -2522,8 +2571,8 @@ public: } }; -/// A recipe for converting the canonical IV value to the corresponding value of -/// an IV with different start and step values, using Start + CanonicalIV * +/// A recipe for converting the input value \p IV to the corresponding +/// value of an IV with different start and step values, using Start + IV * /// Step. class VPDerivedIVRecipe : public VPSingleDefRecipe { /// Kind of the induction. @@ -2541,16 +2590,16 @@ public: Start, CanonicalIV, Step) {} VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind, - const FPMathOperator *FPBinOp, VPValue *Start, - VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step) - : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}), - Kind(Kind), FPBinOp(FPBinOp) {} + const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV, + VPValue *Step) + : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind), + FPBinOp(FPBinOp) {} ~VPDerivedIVRecipe() override = default; VPRecipeBase *clone() override { - return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), - getCanonicalIV(), getStepValue()); + return new VPDerivedIVRecipe(Kind, FPBinOp, getStartValue(), getOperand(1), + getStepValue()); } VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC) @@ -2570,9 +2619,6 @@ public: } VPValue *getStartValue() const { return getOperand(0); } - VPCanonicalIVPHIRecipe *getCanonicalIV() const { - return cast<VPCanonicalIVPHIRecipe>(getOperand(1)); - } VPValue *getStepValue() const { return getOperand(2); } /// Returns true if the recipe only uses the first lane of operand \p Op.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 04e30312dc23..c8ae2ee5a30f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -216,14 +216,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { Type *ResultTy = TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe()) .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe, - VPReductionPHIRecipe, VPWidenPointerInductionRecipe>( - [this](const auto *R) { - // Handle header phi recipes, except VPWienIntOrFpInduction - // which needs special handling due it being possibly truncated. - // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPReductionPHIRecipe, VPWidenPointerInductionRecipe, + VPEVLBasedIVPHIRecipe>([this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due to it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue()); + }) .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>( [](const auto *R) { return R->getScalarType(); }) .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 124ae3108d8a..1be0287ce7c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -286,6 +286,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::ComputeReductionResult: case VPInstruction::PtrAdd: + case VPInstruction::ExplicitVectorLength: return true; default: return false; @@ -386,6 +387,33 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { Value *Zero = ConstantInt::get(ScalarTC->getType(), 0); return Builder.CreateSelect(Cmp, Sub, Zero); } + case VPInstruction::ExplicitVectorLength: { + // Compute EVL + auto GetEVL = [=](VPTransformState &State, Value *AVL) { + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + + // TODO: Add support for MaxSafeDist for correct loop emission. + assert(State.VF.isScalable() && "Expected scalable vector factor."); + Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); + + Value *EVL = State.Builder.CreateIntrinsic( + State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length, + {AVL, VFArg, State.Builder.getTrue()}); + return EVL; + }; + // TODO: Restructure this code with an explicit remainder loop, vsetvli can + // be outside of the main loop. + assert(Part == 0 && "No unrolling expected for predicated vectorization."); + // Compute VTC - IV as the AVL (requested vector length). 
+ Value *Index = State.get(getOperand(0), VPIteration(0, 0)); + Value *TripCount = State.get(getOperand(1), VPIteration(0, 0)); + Value *AVL = State.Builder.CreateSub(TripCount, Index); + Value *EVL = GetEVL(State, AVL); + assert(!State.EVL && "multiple EVL recipes"); + State.EVL = this; + return EVL; + } case VPInstruction::CanonicalIVIncrementForPart: { auto *IV = State.get(getOperand(0), VPIteration(0, 0)); if (Part == 0) @@ -592,6 +620,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { // TODO: Cover additional opcodes. return vputils::onlyFirstLaneUsed(this); case VPInstruction::ActiveLaneMask: + case VPInstruction::ExplicitVectorLength: case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::BranchOnCount: @@ -628,6 +657,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; + case VPInstruction::ExplicitVectorLength: + O << "EXPLICIT-VECTOR-LENGTH"; + break; case VPInstruction::FirstOrderRecurrenceSplice: O << "first-order splice"; break; @@ -1184,7 +1216,7 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "= DERIVED-IV "; getStartValue()->printAsOperand(O, SlotTracker); O << " + "; - getCanonicalIV()->printAsOperand(O, SlotTracker); + getOperand(1)->printAsOperand(O, SlotTracker); O << " * "; getStepValue()->printAsOperand(O, SlotTracker); } @@ -1974,3 +2006,25 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization."); + Value *Start = State.get(getOperand(0), VPIteration(0, 0)); + PHINode *EntryPart = + State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv"); + EntryPart->addIncoming(Start, VectorPH); + 
EntryPart->setDebugLoc(getDebugLoc()); + State.set(this, EntryPart, 0, /*IsScalar=*/true); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3753060cd6ec..1256e4d8fda5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -641,6 +641,25 @@ static void removeRedundantExpandSCEVRecipes(VPlan &Plan) { } } +static void recursivelyDeleteDeadRecipes(VPValue *V) { + SmallVector<VPValue *> WorkList; + SmallPtrSet<VPValue *, 8> Seen; + WorkList.push_back(V); + + while (!WorkList.empty()) { + VPValue *Cur = WorkList.pop_back_val(); + if (!Seen.insert(Cur).second) + continue; + VPRecipeBase *R = Cur->getDefiningRecipe(); + if (!R) + continue; + if (!isDeadRecipe(*R)) + continue; + WorkList.append(R->op_begin(), R->op_end()); + R->eraseFromParent(); + } +} + void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { @@ -674,7 +693,11 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}); + + SmallVector<VPValue *> PossiblyDead(Term->operands()); Term->eraseFromParent(); + for (VPValue *Op : PossiblyDead) + recursivelyDeleteDeadRecipes(Op); ExitingVPBB->appendRecipe(BOC); Plan.setVF(BestVF); Plan.setUF(BestUF); @@ -1186,6 +1209,45 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( return LaneMaskPhi; } +/// Replaces (ICMP_ULE, WideCanonicalIV, backedge-taken-count) pattern using +/// the given \p Idiom. 
+static void +replaceHeaderPredicateWith(VPlan &Plan, VPValue &Idiom, + function_ref<bool(VPUser &, unsigned)> Cond = {}) { + auto *FoundWidenCanonicalIVUser = + find_if(Plan.getCanonicalIV()->users(), + [](VPUser *U) { return isa<VPWidenCanonicalIVRecipe>(U); }); + if (FoundWidenCanonicalIVUser == Plan.getCanonicalIV()->users().end()) + return; + auto *WideCanonicalIV = + cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser); + // Walk users of WideCanonicalIV and replace all compares of the form + // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with + // the given idiom VPValue. + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) { + auto *CompareToReplace = dyn_cast<VPInstruction>(U); + if (!CompareToReplace || + CompareToReplace->getOpcode() != Instruction::ICmp || + CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || + CompareToReplace->getOperand(1) != BTC) + continue; + + assert(CompareToReplace->getOperand(0) == WideCanonicalIV && + "WidenCanonicalIV must be the first operand of the compare"); + if (Cond) { + CompareToReplace->replaceUsesWithIf(&Idiom, Cond); + if (!CompareToReplace->getNumUsers()) + CompareToReplace->eraseFromParent(); + } else { + CompareToReplace->replaceAllUsesWith(&Idiom); + CompareToReplace->eraseFromParent(); + } + } + if (!WideCanonicalIV->getNumUsers()) + WideCanonicalIV->eraseFromParent(); +} + void VPlanTransforms::addActiveLaneMask( VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck) { @@ -1215,20 +1277,77 @@ void VPlanTransforms::addActiveLaneMask( // Walk users of WideCanonicalIV and replace all compares of the form // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an // active-lane-mask. 
- VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - for (VPUser *U : SmallVector<VPUser *>(WideCanonicalIV->users())) { - auto *CompareToReplace = dyn_cast<VPInstruction>(U); - if (!CompareToReplace || - CompareToReplace->getOpcode() != Instruction::ICmp || - CompareToReplace->getPredicate() != CmpInst::ICMP_ULE || - CompareToReplace->getOperand(1) != BTC) - continue; + replaceHeaderPredicateWith(Plan, *LaneMask); +} - assert(CompareToReplace->getOperand(0) == WideCanonicalIV && - "WidenCanonicalIV must be the first operand of the compare"); - CompareToReplace->replaceAllUsesWith(LaneMask); - CompareToReplace->eraseFromParent(); +/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and +/// replaces all uses except the canonical IV increment of +/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe +/// is used only for loop iterations counting after this transformation. +/// +/// The function uses the following definitions: +/// %StartV is the canonical induction start value. +/// +/// The function adds the following recipes: +/// +/// vector.ph: +/// ... +/// +/// vector.body: +/// ... +/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ], +/// [ %NextEVLIV, %vector.body ] +/// %VPEVL = EXPLICIT-VECTOR-LENGTH %EVLPhi, original TC +/// ... +/// %NextEVLIV = add IVSize (cast i32 %VPEVL to IVSize), %EVLPhi +/// ... +/// +void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { + VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + auto *CanonicalIVPHI = Plan.getCanonicalIV(); + VPValue *StartV = CanonicalIVPHI->getStartValue(); + + // TODO: revisit this and try to remove the mask operand. + // Walk VPWidenMemoryInstructionRecipe users of WideCanonicalIV and replace + // all compares of the form (ICMP_ULE, WideCanonicalIV, backedge-taken-count), + // used as mask in VPWidenMemoryInstructionRecipe, with an all-true-mask.
+ Value *TrueMask = + ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext()); + VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask); + replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) { + return isa<VPWidenMemoryInstructionRecipe>(U); + }); + // Now create the ExplicitVectorLengthPhi recipe in the main loop. + auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); + EVLPhi->insertAfter(CanonicalIVPHI); + auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, + {EVLPhi, Plan.getTripCount()}); + VPEVL->insertBefore(*Header, Header->getFirstNonPhi()); + + auto *CanonicalIVIncrement = + cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue()); + VPSingleDefRecipe *OpVPEVL = VPEVL; + if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); + IVSize != 32) { + OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc + : Instruction::ZExt, + OpVPEVL, CanonicalIVPHI->getScalarType()); + OpVPEVL->insertBefore(CanonicalIVIncrement); } + auto *NextEVLIV = + new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, + {CanonicalIVIncrement->hasNoUnsignedWrap(), + CanonicalIVIncrement->hasNoSignedWrap()}, + CanonicalIVIncrement->getDebugLoc(), "index.evl.next"); + NextEVLIV->insertBefore(CanonicalIVIncrement); + EVLPhi->addOperand(NextEVLIV); + + // Replace all uses of VPCanonicalIVPHIRecipe by + // VPEVLBasedIVPHIRecipe except for the canonical IV increment. + CanonicalIVPHI->replaceAllUsesWith(EVLPhi); + CanonicalIVIncrement->setOperand(0, CanonicalIVPHI); + // TODO: support unroll factor > 1. + Plan.setUF(1); } void VPlanTransforms::dropPoisonGeneratingRecipes( @@ -1254,9 +1373,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // handled. 
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) || isa<VPInterleaveRecipe>(CurRec) || - isa<VPScalarIVStepsRecipe>(CurRec) || - isa<VPCanonicalIVPHIRecipe>(CurRec) || - isa<VPActiveLaneMaskPHIRecipe>(CurRec)) + isa<VPScalarIVStepsRecipe>(CurRec) || isa<VPHeaderPHIRecipe>(CurRec)) continue; // This recipe contributes to the address computation of a widen diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index ff83c3f083b0..0cbc70713d9c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -98,6 +98,13 @@ struct VPlanTransforms { /// VPlan directly. static void dropPoisonGeneratingRecipes( VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication); + + /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and + /// replaces all uses except the canonical IV increment of + /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. + /// VPCanonicalIVPHIRecipe is only used to control the loop after + /// this transformation. + static void addExplicitVectorLength(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 1d2c17e91b7a..8b221d30e525 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -368,6 +368,7 @@ public: // VPHeaderPHIRecipe need to be kept together. 
VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, + VPEVLBasedIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 7ebdb914fb85..12d37fa711db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -92,7 +92,50 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB, for (const VPRecipeBase &R : *VPBB) RecipeNumbering[&R] = Cnt++; + // Set of recipe types along with VPInstruction Opcodes of all EVL-related + // recipes that must appear at most once in the header block. + DenseSet<unsigned> EVLFound; + const VPRecipeBase *VPWidenMemRecipe = nullptr; + const VPlan *Plan = VPBB->getPlan(); + bool IsHeader = Plan->getEntry()->getNumSuccessors() == 1 && + Plan->getVectorLoopRegion()->getEntry() == VPBB; + auto CheckEVLRecipiesInsts = [&](const VPRecipeBase *R) { + if (isa<VPEVLBasedIVPHIRecipe>(R)) { + if (!IsHeader) { + errs() << "EVL PHI recipe not in entry block!\n"; + return false; + } + if (!EVLFound.insert(VPDef::VPEVLBasedIVPHISC).second) { + errs() << "EVL PHI recipe inserted more than once!\n"; + return false; + } + return true; + } + if (const auto *RInst = dyn_cast<VPInstruction>(R); + RInst && RInst->getOpcode() == VPInstruction::ExplicitVectorLength) { + if (!IsHeader) { + errs() << "EVL instruction not in the header block!\n"; + return false; + } + if (!EVLFound.insert(RInst->getOpcode() + VPDef::VPLastPHISC).second) { + errs() << "EVL instruction inserted more than once!\n"; + return false; + } + if (VPWidenMemRecipe) { + errs() << "Use of EVL instruction by widen memory recipe before " + "definition!\n"; + return false; + } + return true; + } + if (isa<VPWidenMemoryInstructionRecipe>(R)) + VPWidenMemRecipe = R; + return true; + }; + for (const VPRecipeBase &R : *VPBB) { + if (!CheckEVLRecipiesInsts(&R)) + return false; for (const VPValue *V : 
R.definedValues()) { for (const VPUser *U : V->users()) { auto *UI = dyn_cast<VPRecipeBase>(U); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index af5e7c9bc385..3738220b4f81 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -112,6 +112,7 @@ private: bool foldSingleElementStore(Instruction &I); bool scalarizeLoadExtract(Instruction &I); bool foldShuffleOfBinops(Instruction &I); + bool foldShuffleOfCastops(Instruction &I); bool foldShuffleFromReductions(Instruction &I); bool foldTruncFromReductions(Instruction &I); bool foldSelectShuffle(Instruction &I, bool FromReduction = false); @@ -1432,6 +1433,75 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) { return true; } +/// Try to convert "shuffle (castop), (castop)" with a shared castop operand +/// into "castop (shuffle)". +bool VectorCombine::foldShuffleOfCastops(Instruction &I) { + Value *V0, *V1; + ArrayRef<int> Mask; + if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)), + m_Mask(Mask)))) + return false; + + auto *C0 = dyn_cast<CastInst>(V0); + auto *C1 = dyn_cast<CastInst>(V1); + if (!C0 || !C1) + return false; + + Instruction::CastOps Opcode = C0->getOpcode(); + if (Opcode == Instruction::BitCast || C0->getSrcTy() != C1->getSrcTy()) + return false; + + // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. 
+ if (Opcode != C1->getOpcode()) { + if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) + Opcode = Instruction::SExt; + else + return false; + } + + auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType()); + auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy()); + auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy()); + if (!ShuffleDstTy || !CastDstTy || !CastSrcTy) + return false; + assert(CastDstTy->getElementCount() == CastSrcTy->getElementCount() && + "Unexpected src/dst element counts"); + + auto *NewShuffleDstTy = + FixedVectorType::get(CastSrcTy->getScalarType(), Mask.size()); + + // Try to replace a castop with a shuffle if the shuffle is not costly. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + InstructionCost OldCost = + TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, + TTI::CastContextHint::None, CostKind) + + TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, + TTI::CastContextHint::None, CostKind); + OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + CastDstTy, Mask, CostKind); + + InstructionCost NewCost = TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind); + NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, + TTI::CastContextHint::None, CostKind); + if (NewCost > OldCost) + return false; + + Value *Shuf = + Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), Mask); + Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); + + // Intersect flags from the old casts. + if (auto *NewInst = dyn_cast<Instruction>(Cast)) { + NewInst->copyIRFlags(C0); + NewInst->andIRFlags(C1); + } + + replaceValue(I, *Cast); + return true; +} + /// Given a commutative reduction, the order of the input lanes does not alter /// the results. We can use this to remove certain shuffles feeding the /// reduction, removing the need to shuffle at all. 
@@ -1986,6 +2056,7 @@ bool VectorCombine::run() { break; case Instruction::ShuffleVector: MadeChange |= foldShuffleOfBinops(I); + MadeChange |= foldShuffleOfCastops(I); MadeChange |= foldSelectShuffle(I); break; case Instruction::BitCast: |
