diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 456 |
1 files changed, 260 insertions, 196 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0b809c2b34df..3e3f5adf73a0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -217,7 +217,7 @@ static VPBasicBlock *getPredicatedThenBlock(VPRegionBlock *R) { // is connected to a successor replicate region with the same predicate by a // single, empty VPBasicBlock. static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { - SetVector<VPRegionBlock *> DeletedRegions; + SmallPtrSet<VPRegionBlock *, 4> TransformedRegions; // Collect replicate regions followed by an empty block, followed by another // replicate region with matching masks to process front. This is to avoid @@ -248,7 +248,7 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { // Move recipes from Region1 to its successor region, if both are triangles. for (VPRegionBlock *Region1 : WorkList) { - if (DeletedRegions.contains(Region1)) + if (TransformedRegions.contains(Region1)) continue; auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor()); auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor()); @@ -294,12 +294,10 @@ static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan) { VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock); } VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock); - DeletedRegions.insert(Region1); + TransformedRegions.insert(Region1); } - for (VPRegionBlock *ToDelete : DeletedRegions) - delete ToDelete; - return !DeletedRegions.empty(); + return !TransformedRegions.empty(); } static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, @@ -310,7 +308,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, assert(Instr->getParent() && "Predicated instruction not in any basic block"); auto *BlockInMask = PredRecipe->getMask(); auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); + auto *Entry = + Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); // Replace predicated replicate recipe with a replicate recipe without a // mask but in the replicate region. @@ -318,7 +317,8 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PredRecipe->getUnderlyingInstr(), make_range(PredRecipe->op_begin(), std::prev(PredRecipe->op_end())), PredRecipe->isUniform()); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); + auto *Pred = + Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask); VPPredInstPHIRecipe *PHIRecipe = nullptr; if (PredRecipe->getNumUsers() != 0) { @@ -328,8 +328,10 @@ static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe, PHIRecipe->setOperand(0, RecipeWithoutMask); } PredRecipe->eraseFromParent(); - auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true); + auto *Exiting = + Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); + VPRegionBlock *Region = + Plan.createVPRegionBlock(Entry, Exiting, RegionName, true); // Note: first set Entry as region entry and then connect successors starting // from it in order, to propagate the "parent" of each VPBasicBlock. @@ -396,7 +398,7 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) { VPBlockUtils::disconnectBlocks(VPBB, Succ); VPBlockUtils::connectBlocks(PredVPBB, Succ); } - delete VPBB; + // VPBB is now dead and will be cleaned up when the plan gets destroyed. } return !WorkList.empty(); } @@ -525,7 +527,8 @@ static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, - VPValue *StartV, VPValue *Step, VPBuilder &Builder) { + VPValue *StartV, VPValue *Step, DebugLoc DL, + VPBuilder &Builder) { VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); VPSingleDefRecipe *BaseIV = Builder.createDerivedIV( @@ -540,7 +543,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() && "Not truncating."); assert(ResultTy->isIntegerTy() && "Truncation requires an integer type"); - BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy); + BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL); ResultTy = TruncTy; } @@ -554,26 +557,68 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, cast<VPBasicBlock>(HeaderVPBB->getSingleHierarchicalPredecessor()); VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(VecPreheader); - Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy); + Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL); } return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step); } +static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { + SetVector<VPUser *> Users(V->user_begin(), V->user_end()); + for (unsigned I = 0; I != Users.size(); ++I) { + VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); + if (isa<VPHeaderPHIRecipe>(Cur)) + continue; + for (VPValue *V : Cur->definedValues()) + Users.insert(V->user_begin(), V->user_end()); + } + return Users.takeVector(); +} + /// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd /// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as /// VPWidenPointerInductionRecipe will generate vectors only. If some users /// require vectors while other require scalars, the scalar uses need to extract /// the scalars from the generated vectors (Note that this is different to how -/// int/fp inductions are handled). Also optimize VPWidenIntOrFpInductionRecipe, -/// if any of its users needs scalar values, by providing them scalar steps -/// built on the canonical scalar IV and update the original IV's users. This is -/// an optional optimization to reduce the needs of vector extracts. +/// int/fp inductions are handled). Legalize extract-from-ends using uniform +/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so +/// the correct end value is available. Also optimize +/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by +/// providing them scalar steps built on the canonical scalar IV and update the +/// original IV's users. This is an optional optimization to reduce the needs of +/// vector extracts. static void legalizeAndOptimizeInductions(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; SmallVector<VPRecipeBase *> ToRemove; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { + auto *PhiR = dyn_cast<VPHeaderPHIRecipe>(&Phi); + if (!PhiR) + break; + + // Check if any uniform VPReplicateRecipes using the phi recipe are used by + // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to + // ensure the final value is available. + // TODO: Remove once uniformity analysis is done on VPlan. + for (VPUser *U : collectUsersRecursively(PhiR)) { + auto *ExitIRI = dyn_cast<VPIRInstruction>(U); + VPValue *Op; + if (!ExitIRI || !match(ExitIRI->getOperand(0), + m_VPInstruction<VPInstruction::ExtractFromEnd>( + m_VPValue(Op), m_VPValue()))) + continue; + auto *RepR = dyn_cast<VPReplicateRecipe>(Op); + if (!RepR || !RepR->isUniform()) + continue; + assert(!RepR->isPredicated() && "RepR must not be predicated"); + Instruction *I = RepR->getUnderlyingInstr(); + auto *Clone = + new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false); + Clone->insertAfter(RepR); + RepR->replaceAllUsesWith(Clone); + } + // Replace wide pointer inductions which have only their scalars used by // PtrAdd(IndStart, ScalarIVSteps (0, Step)). if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) { @@ -586,7 +631,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { VPValue *StepV = PtrIV->getOperand(1); VPScalarIVStepsRecipe *Steps = createScalarIVSteps( Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr, - nullptr, StartV, StepV, Builder); + nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder); VPValue *PtrAdd = Builder.createPtrAdd(PtrIV->getStartValue(), Steps, PtrIV->getDebugLoc(), "next.gep"); @@ -610,7 +655,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) { Plan, ID.getKind(), ID.getInductionOpcode(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()), WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(), - Builder); + WideIV->getDebugLoc(), Builder); // Update scalar users of IV to use Step instead. if (!HasOnlyVectorVFs) @@ -660,13 +705,158 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { } } +/// Try to simplify recipe \p R. +static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { + using namespace llvm::VPlanPatternMatch; + + if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { + // Try to remove redundant blend recipes. + SmallPtrSet<VPValue *, 4> UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + return; + } + + if (Blend->isNormalized()) + return; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. + VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + + SmallVector<VPValue *, 4> OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + return; + } + + VPValue *A; + if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { + VPValue *Trunc = R.getVPSingleValue(); + Type *TruncTy = TypeInfo.inferScalarType(Trunc); + Type *ATy = TypeInfo.inferScalarType(A); + if (TruncTy == ATy) { + Trunc->replaceAllUsesWith(A); + } else { + // Don't replace a scalarizing recipe with a widened cast. + if (isa<VPReplicateRecipe>(&R)) + return; + if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { + + unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) + ? Instruction::SExt + : Instruction::ZExt; + auto *VPC = + new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); + if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { + // UnderlyingExt has distinct return type, used to retain legacy cost. + VPC->setUnderlyingValue(UnderlyingExt); + } + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { + auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); + VPC->insertBefore(&R); + Trunc->replaceAllUsesWith(VPC); + } + } +#ifndef NDEBUG + // Verify that the cached type info is for both A and its users is still + // accurate by comparing it to freshly computed types. + VPTypeAnalysis TypeInfo2( + R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); + assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); + for (VPUser *U : A->users()) { + auto *R = cast<VPRecipeBase>(U); + for (VPValue *VPV : R->definedValues()) + assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); + } +#endif + } + + // Simplify (X && Y) || (X && !Y) -> X. + // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X + // && (Y || Z) and (X || !X) into true. This requires queuing newly created + // recipes to be visited during simplification. + VPValue *X, *Y, *X1, *Y1; + if (match(&R, + m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), + m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && + X == X1 && Y == Y1) { + R.getVPSingleValue()->replaceAllUsesWith(X); + R.eraseFromParent(); + return; + } + + if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + if (match(&R, m_Not(m_Not(m_VPValue(A))))) + return R.getVPSingleValue()->replaceAllUsesWith(A); + + // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. + if ((match(&R, + m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || + match(&R, + m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && + TypeInfo.inferScalarType(R.getOperand(1)) == + TypeInfo.inferScalarType(R.getVPSingleValue())) + return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); +} + +/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all +/// un-typed live-ins in VPTypeAnalysis. +static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) { + ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( + Plan.getEntry()); + VPTypeAnalysis TypeInfo(CanonicalIVTy); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + simplifyRecipe(R, TypeInfo); + } + } +} + void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE) { assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - VPBasicBlock *ExitingVPBB = - Plan.getVectorLoopRegion()->getExitingBasicBlock(); + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); auto *Term = &ExitingVPBB->back(); // Try to simplify the branch condition if TC <= VF * UF when preparing to // execute the plan for the main vector loop. We only do this if the @@ -690,16 +880,44 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) return; - LLVMContext &Ctx = SE.getContext(); - auto *BOC = new VPInstruction( - VPInstruction::BranchOnCond, - {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + // The vector loop region only executes once. If possible, completely remove + // the region, otherwise replace the terminator controlling the latch with + // (BranchOnCond true). + auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry()); + auto *CanIVTy = Plan.getCanonicalIV()->getScalarType(); + if (all_of( + Header->phis(), + IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) { + for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) { + auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR); + HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue()); + HeaderPhiR->eraseFromParent(); + } + + VPBlockBase *Preheader = VectorRegion->getSinglePredecessor(); + VPBlockBase *Exit = VectorRegion->getSingleSuccessor(); + VPBlockUtils::disconnectBlocks(Preheader, VectorRegion); + VPBlockUtils::disconnectBlocks(VectorRegion, Exit); + + for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry())) + B->setParent(nullptr); + + VPBlockUtils::connectBlocks(Preheader, Header); + VPBlockUtils::connectBlocks(ExitingVPBB, Exit); + simplifyRecipes(Plan, CanIVTy); + } else { + // The vector region contains header phis for which we cannot remove the + // loop region yet. + LLVMContext &Ctx = SE.getContext(); + auto *BOC = new VPInstruction( + VPInstruction::BranchOnCond, + {Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc()); + ExitingVPBB->appendRecipe(BOC); + } - SmallVector<VPValue *> PossiblyDead(Term->operands()); Term->eraseFromParent(); - for (VPValue *Op : PossiblyDead) - recursivelyDeleteDeadRecipes(Op); - ExitingVPBB->appendRecipe(BOC); + VPlanTransforms::removeDeadRecipes(Plan); + Plan.setVF(BestVF); Plan.setUF(BestUF); // TODO: Further simplifications are possible @@ -910,18 +1128,6 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, return true; } -static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) { - SetVector<VPUser *> Users(V->user_begin(), V->user_end()); - for (unsigned I = 0; I != Users.size(); ++I) { - VPRecipeBase *Cur = cast<VPRecipeBase>(Users[I]); - if (isa<VPHeaderPHIRecipe>(Cur)) - continue; - for (VPValue *V : Cur->definedValues()) - Users.insert(V->user_begin(), V->user_end()); - } - return Users.takeVector(); -} - void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { for (VPRecipeBase &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -940,138 +1146,6 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { } } -/// Try to simplify recipe \p R. -static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { - using namespace llvm::VPlanPatternMatch; - - if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) { - // Try to remove redundant blend recipes. - SmallPtrSet<VPValue *, 4> UniqueValues; - if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) - UniqueValues.insert(Blend->getIncomingValue(0)); - for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (!match(Blend->getMask(I), m_False())) - UniqueValues.insert(Blend->getIncomingValue(I)); - - if (UniqueValues.size() == 1) { - Blend->replaceAllUsesWith(*UniqueValues.begin()); - Blend->eraseFromParent(); - return; - } - - if (Blend->isNormalized()) - return; - - // Normalize the blend so its first incoming value is used as the initial - // value with the others blended into it. - - unsigned StartIndex = 0; - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - // If a value's mask is used only by the blend then is can be deadcoded. - // TODO: Find the most expensive mask that can be deadcoded, or a mask - // that's used by multiple blends where it can be removed from them all. - VPValue *Mask = Blend->getMask(I); - if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { - StartIndex = I; - break; - } - } - - SmallVector<VPValue *, 4> OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - if (I == StartIndex) - continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast<PHINode>(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - return; - } - - VPValue *A; - if (match(&R, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) { - VPValue *Trunc = R.getVPSingleValue(); - Type *TruncTy = TypeInfo.inferScalarType(Trunc); - Type *ATy = TypeInfo.inferScalarType(A); - if (TruncTy == ATy) { - Trunc->replaceAllUsesWith(A); - } else { - // Don't replace a scalarizing recipe with a widened cast. - if (isa<VPReplicateRecipe>(&R)) - return; - if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) { - - unsigned ExtOpcode = match(R.getOperand(0), m_SExt(m_VPValue())) - ? Instruction::SExt - : Instruction::ZExt; - auto *VPC = - new VPWidenCastRecipe(Instruction::CastOps(ExtOpcode), A, TruncTy); - if (auto *UnderlyingExt = R.getOperand(0)->getUnderlyingValue()) { - // UnderlyingExt has distinct return type, used to retain legacy cost. - VPC->setUnderlyingValue(UnderlyingExt); - } - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) { - auto *VPC = new VPWidenCastRecipe(Instruction::Trunc, A, TruncTy); - VPC->insertBefore(&R); - Trunc->replaceAllUsesWith(VPC); - } - } -#ifndef NDEBUG - // Verify that the cached type info is for both A and its users is still - // accurate by comparing it to freshly computed types. - VPTypeAnalysis TypeInfo2( - R.getParent()->getPlan()->getCanonicalIV()->getScalarType()); - assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A)); - for (VPUser *U : A->users()) { - auto *R = cast<VPRecipeBase>(U); - for (VPValue *VPV : R->definedValues()) - assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV)); - } -#endif - } - - // Simplify (X && Y) || (X && !Y) -> X. - // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X - // && (Y || Z) and (X || !X) into true. This requires queuing newly created - // recipes to be visited during simplification. - VPValue *X, *Y, *X1, *Y1; - if (match(&R, - m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), - m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && - X == X1 && Y == Y1) { - R.getVPSingleValue()->replaceAllUsesWith(X); - R.eraseFromParent(); - return; - } - - if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) - return R.getVPSingleValue()->replaceAllUsesWith(A); - - if (match(&R, m_Not(m_Not(m_VPValue(A))))) - return R.getVPSingleValue()->replaceAllUsesWith(A); - - // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. - if ((match(&R, - m_DerivedIV(m_SpecificInt(0), m_VPValue(A), m_SpecificInt(1))) || - match(&R, - m_DerivedIV(m_SpecificInt(0), m_SpecificInt(0), m_VPValue()))) && - TypeInfo.inferScalarType(R.getOperand(1)) == - TypeInfo.inferScalarType(R.getVPSingleValue())) - return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); -} - /// Move loop-invariant recipes out of the vector loop region in \p Plan. static void licm(VPlan &Plan) { VPBasicBlock *Preheader = Plan.getVectorPreheader(); @@ -1106,19 +1180,6 @@ static void licm(VPlan &Plan) { } } -/// Try to simplify the recipes in \p Plan. -static void simplifyRecipes(VPlan &Plan) { - ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( - Plan.getEntry()); - Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType(); - VPTypeAnalysis TypeInfo(CanonicalIVType); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - simplifyRecipe(R, TypeInfo); - } - } -} - void VPlanTransforms::truncateToMinimalBitwidths( VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) { #ifndef NDEBUG @@ -1256,10 +1317,10 @@ void VPlanTransforms::optimize(VPlan &Plan) { removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); legalizeAndOptimizeInductions(Plan); removeRedundantExpandSCEVRecipes(Plan); - simplifyRecipes(Plan); + simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType()); removeDeadRecipes(Plan); createAndOptimizeReplicateRegions(Plan); @@ -1496,10 +1557,13 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, auto *CastR = cast<VPWidenCastRecipe>(CR); VPID = VPIntrinsic::getForOpcode(CastR->getOpcode()); } - assert(VPID != Intrinsic::not_intrinsic && "Expected VP intrinsic"); + + // Not all intrinsics have a corresponding VP intrinsic. + if (VPID == Intrinsic::not_intrinsic) + return nullptr; assert(VPIntrinsic::getMaskParamPos(VPID) && VPIntrinsic::getVectorLengthParamPos(VPID) && - "Expected VP intrinsic"); + "Expected VP intrinsic to have mask and EVL"); SmallVector<VPValue *> Ops(CR->operands()); Ops.push_back(&AllOneMask); @@ -1656,9 +1720,9 @@ bool VPlanTransforms::tryAddExplicitVectorLength( VPSingleDefRecipe *OpVPEVL = VPEVL; if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits(); IVSize != 32) { - OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc - : Instruction::ZExt, - OpVPEVL, CanonicalIVPHI->getScalarType()); + OpVPEVL = new VPScalarCastRecipe( + IVSize < 32 ? Instruction::Trunc : Instruction::ZExt, OpVPEVL, + CanonicalIVPHI->getScalarType(), CanonicalIVIncrement->getDebugLoc()); OpVPEVL->insertBefore(CanonicalIVIncrement); } auto *NextEVLIV = @@ -1898,7 +1962,7 @@ void VPlanTransforms::handleUncountableEarlyExit( if (OrigLoop->getUniqueExitBlock()) { VPEarlyExitBlock = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]); } else { - VPEarlyExitBlock = VPIRBasicBlock::fromBasicBlock( + VPEarlyExitBlock = Plan.createVPIRBasicBlock( !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); } @@ -1908,7 +1972,7 @@ void VPlanTransforms::handleUncountableEarlyExit( IsEarlyExitTaken = Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); - VPBasicBlock *NewMiddle = new VPBasicBlock("middle.split"); + VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock); NewMiddle->swapSuccessors(); |
