diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 410 |
1 files changed, 362 insertions, 48 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c4fdcccc6d62..bf5148954309 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const { return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory(); + case VPInterleaveEVLSC: case VPInterleaveSC: - return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0; + return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0; case VPWidenStoreEVLSC: case VPWidenStoreSC: return true; @@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return false; } default: + // FIXME: Return false if the recipe represents an interleaved store. return true; } } @@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { "underlying instruction has side-effects"); return false; } + case VPInterleaveEVLSC: case VPInterleaveSC: return mayWriteToMemory(); case VPWidenLoadEVLSC: @@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) { Instruction *UI = nullptr; if (auto *S = dyn_cast<VPSingleDefRecipe>(this)) UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); - else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this)) + else if (auto *IG = dyn_cast<VPInterleaveBase>(this)) UI = IG->getInsertPos(); else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this)) UI = &WidenMem->getIngredient(); @@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPIRFlags::intersectFlags(const VPIRFlags &Other) { + assert(OpType == Other.OpType && "OpType must match"); + switch (OpType) { + case OperationType::OverflowingBinOp: + WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; + WrapFlags.HasNSW &= Other.WrapFlags.HasNSW; + break; + case OperationType::Trunc: + TruncFlags.HasNUW &= Other.TruncFlags.HasNUW; + TruncFlags.HasNSW &= Other.TruncFlags.HasNSW; + break; + case OperationType::DisjointOp: + DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint; + break; + case OperationType::PossiblyExactOp: + ExactFlags.IsExact &= Other.ExactFlags.IsExact; + break; + case OperationType::GEPOp: + GEPFlags &= Other.GEPFlags; + break; + case OperationType::FPMathOp: + FMFs.NoNaNs &= Other.FMFs.NoNaNs; + FMFs.NoInfs &= Other.FMFs.NoInfs; + break; + case OperationType::NonNegOp: + NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg; + break; + case OperationType::Cmp: + assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate"); + break; + case OperationType::Other: + assert(AllFlags == Other.AllFlags && "Cannot drop other flags"); + break; + } +} + FastMathFlags VPIRFlags::getFastMathFlags() const { assert(OpType == OperationType::FPMathOp && "recipe doesn't have fast math flags"); @@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case Instruction::ICmp: case Instruction::FCmp: case Instruction::Store: - case VPInstruction::ActiveLaneMask: case VPInstruction::BranchOnCount: case VPInstruction::ComputeReductionResult: case VPInstruction::FirstOrderRecurrenceSplice: @@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::WideIVStep: return 2; case Instruction::Select: + case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: return 3; @@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Name); auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = VectorType::get(Int1Ty, State.VF); + auto PredTy = VectorType::get( + Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue()) + ->getZExtValue()); return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); @@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::AnyOf: { - Value *Res = State.get(getOperand(0)); + Value *Res = Builder.CreateFreeze(State.get(getOperand(0))); for (VPValue *Op : drop_begin(operands())) - Res = Builder.CreateOr(Res, State.get(Op)); + Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op))); return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::ExtractLane: { @@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) { unsigned LastOpIdx = getNumOperands() - 1; Value *Res = nullptr; for (int Idx = LastOpIdx; Idx >= 0; --Idx) { - Value *TrailingZeros = Builder.CreateCountTrailingZeroElems( - Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name); + Value *TrailingZeros = + State.VF.isScalar() + ? Builder.CreateZExt( + Builder.CreateICmpEQ(State.get(getOperand(Idx)), + Builder.getFalse()), + Builder.getInt64Ty()) + : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), + State.get(getOperand(Idx)), + true, Name); Value *Current = Builder.CreateAdd( Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros); if (Res) { @@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } switch (getOpcode()) { + case Instruction::Select: { + // TODO: It may be possible to improve this by analyzing where the + // condition operand comes from. + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *CondTy = Ctx.Types.inferScalarType(getOperand(0)); + auto *VecTy = Ctx.Types.inferScalarType(getOperand(1)); + if (!vputils::onlyFirstLaneUsed(this)) { + CondTy = toVectorTy(CondTy, VF); + VecTy = toVectorTy(VecTy, VF); + } + return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred, + Ctx.CostKind); + } case Instruction::ExtractElement: case VPInstruction::ExtractLane: { + if (VF.isScalar()) { + // ExtractLane with VF=1 takes care of handling extracting across multiple + // parts. + return 0; + } + // Add on the cost of extracting the element. auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, @@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind); } case VPInstruction::FirstActiveLane: { + Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0)); + if (VF.isScalar()) + return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy, + CmpInst::makeCmpResultType(ScalarTy), + CmpInst::ICMP_EQ, Ctx.CostKind); // Calculate the cost of determining the lane index. - auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF); + auto *PredTy = toVectorTy(ScalarTy, VF); IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx), {PredTy, Type::getInt1Ty(Ctx.LLVMCtx)}); @@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, } case VPInstruction::ActiveLaneMask: { Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0)); - Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + unsigned Multiplier = + cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue(); + Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier); IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy, {ArgTy, ArgTy}); return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind); @@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { State.set(this, V); } -InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R. +static InstructionCost getCostForIntrinsics(Intrinsic::ID ID, + ArrayRef<const VPValue *> Operands, + const VPRecipeWithIRFlags &R, + ElementCount VF, + VPCostContext &Ctx) { // Some backends analyze intrinsic arguments to determine cost. Use the // underlying value for the operand if it has one. Otherwise try to use the // operand of the underlying call instruction, if there is one. Otherwise // clear Arguments. // TODO: Rework TTI interface to be independent of concrete IR values. SmallVector<const Value *> Arguments; - for (const auto &[Idx, Op] : enumerate(operands())) { + for (const auto &[Idx, Op] : enumerate(Operands)) { auto *V = Op->getUnderlyingValue(); if (!V) { - if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) { + if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) { Arguments.push_back(UI->getArgOperand(Idx)); continue; } @@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, Arguments.push_back(V); } - Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF); + Type *ScalarRetTy = Ctx.Types.inferScalarType(&R); + Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy; SmallVector<Type *> ParamTys; - for (unsigned I = 0; I != getNumOperands(); ++I) - ParamTys.push_back( - toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF)); + for (const VPValue *Op : Operands) { + ParamTys.push_back(VF.isVector() + ? toVectorTy(Ctx.Types.inferScalarType(Op), VF) + : Ctx.Types.inferScalarType(Op)); + } // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst. - FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(); + FastMathFlags FMF = + R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags(); IntrinsicCostAttributes CostAttrs( - VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF, - dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()), + ID, RetTy, Arguments, ParamTys, FMF, + dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()), InstructionCost::getInvalid(), &Ctx.TLI); return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind); } +InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + SmallVector<const VPValue *> ArgOps(operands()); + return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); +} + StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const { return Intrinsic::getBaseName(VectorIntrinsicID); } @@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: - // More complex computation, let the legacy cost-model handle this for now. - return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF); + // If the div/rem operation isn't safe to speculate and requires + // predication, then the only way we can even create a vplan is to insert + // a select on the second input operand to ensure we use the value of 1 + // for the inactive lanes. The select will be costed separately. case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint { if (VF.isScalar()) return TTI::CastContextHint::Normal; - if (isa<VPInterleaveRecipe>(R)) + if (isa<VPInterleaveBase>(R)) return TTI::CastContextHint::Interleave; if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R)) return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked @@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)); assert(RedTy->isIntegerTy() && "VPExpressionRecipe only supports integer types currently."); + unsigned Opcode = RecurrenceDescriptor::getOpcode( + cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind()); switch (ExpressionType) { case ExpressionTypes::ExtendedReduction: { - unsigned Opcode = RecurrenceDescriptor::getOpcode( - cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind()); return Ctx.TTI.getExtendedReductionCost( Opcode, cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == @@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF, RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); } case ExpressionTypes::MulAccReduction: - return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind); + return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy, + Ctx.CostKind); case ExpressionTypes::ExtMulAccReduction: return Ctx.TTI.getMulAccReductionCost( cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() == Instruction::ZExt, - RedTy, SrcVecTy, Ctx.CostKind); + Opcode, RedTy, SrcVecTy, Ctx.CostKind); } llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum"); } @@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // instruction cost. return 0; case Instruction::Call: { - if (!isSingleScalar()) { - // TODO: Handle remaining call costs here as well. - if (VF.isScalable()) - return InstructionCost::getInvalid(); - break; - } - auto *CalledFn = cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); - if (CalledFn->isIntrinsic()) - break; + SmallVector<const VPValue *> ArgOps(drop_end(operands())); SmallVector<Type *, 4> Tys; - for (VPValue *ArgOp : drop_end(operands())) + for (const VPValue *ArgOp : ArgOps) Tys.push_back(Ctx.Types.inferScalarType(ArgOp)); + + if (CalledFn->isIntrinsic()) + // Various pseudo-intrinsics with costs of 0 are scalarized instead of + // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early. + switch (CalledFn->getIntrinsicID()) { + case Intrinsic::assume: + case Intrinsic::lifetime_end: + case Intrinsic::lifetime_start: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: { + assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx) == 0 && + "scalarizing intrinsic should be free"); + return InstructionCost(0); + } + default: + break; + } + Type *ResultTy = Ctx.Types.inferScalarType(this); - return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + InstructionCost ScalarCallCost = + Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind); + if (isSingleScalar()) { + if (CalledFn->isIntrinsic()) + ScalarCallCost = std::min( + ScalarCallCost, + getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this, + ElementCount::getFixed(1), Ctx)); + return ScalarCallCost; + } + + if (VF.isScalable()) + return InstructionCost::getInvalid(); + + // Compute the cost of scalarizing the result and operands if needed. + InstructionCost ScalarizationCost = 0; + if (VF.isVector()) { + if (!ResultTy->isVoidTy()) { + for (Type *VectorTy : + to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) { + ScalarizationCost += Ctx.TTI.getScalarizationOverhead( + cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert=*/true, + /*Extract=*/false, Ctx.CostKind); + } + } + // Skip operands that do not require extraction/scalarization and do not + // incur any overhead. + SmallPtrSet<const VPValue *, 4> UniqueOperands; + Tys.clear(); + for (auto *Op : ArgOps) { + if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) || + !UniqueOperands.insert(Op).second) + continue; + Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF)); + } + ScalarizationCost += + Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind); + } + + return ScalarCallCost * VF.getFixedValue() + ScalarizationCost; } case Instruction::Add: case Instruction::Sub: @@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: { + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1), Ctx) * (isSingleScalar() ? 1 : VF.getFixedValue()); + case Instruction::Load: + case Instruction::Store: { + if (isSingleScalar()) { + bool IsLoad = UI->getOpcode() == Instruction::Load; + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); + return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); + } + // TODO: See getMemInstScalarizationCost for how to handle replicating and + // predicated cases. + break; } } @@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, // TODO: Using the original IR may not be accurate. // Currently, ARM will use the underlying IR to calculate gather/scatter // instruction cost. - const Value *Ptr = getLoadStorePointerOperand(&Ingredient); - Type *PtrTy = toVectorTy(Ptr->getType(), VF); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + const Value *Ptr = getLoadStorePointerOperand(&Ingredient); + Type *PtrTy = Ptr->getType(); + + // If the address value is uniform across all lanes, then the address can be + // calculated with scalar type and broadcast. + if (!vputils::isSingleScalar(getAddr())) + PtrTy = toVectorTy(PtrTy, VF); + return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr, Ctx.CostKind) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, @@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Lane && "Interleave group being replicated."); - assert((!NeedsMaskForGaps || !State.VF.isScalable()) && + assert((!needsMaskForGaps() || !State.VF.isScalable()) && "Masking gaps for scalable vectors is not yet supported."); - const InterleaveGroup<Instruction> *Group = IG; + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); Instruction *Instr = Group->getInsertPos(); // Prepare for the vector type of the interleaved load/store. @@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved load group. if (isa<LoadInst>(Instr)) { Value *MaskForGaps = nullptr; - if (NeedsMaskForGaps) { + if (needsMaskForGaps()) { MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); @@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // Vectorize the interleaved store group. Value *MaskForGaps = createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group); - assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) && + assert(((MaskForGaps != nullptr) == needsMaskForGaps()) && "Mismatch between NeedsMaskForGaps and MaskForGaps"); ArrayRef<VPValue *> StoredValues = getStoredValues(); // Collect the stored vector from each member. @@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; @@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { +void VPInterleaveEVLRecipe::execute(VPTransformState &State) { + assert(!State.Lane && "Interleave group being replicated."); + assert(State.VF.isScalable() && + "Only support scalable VF for EVL tail-folding."); + assert(!needsMaskForGaps() && + "Masking gaps for scalable vectors is not yet supported."); + const InterleaveGroup<Instruction> *Group = getInterleaveGroup(); + Instruction *Instr = Group->getInsertPos(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = getLoadStoreType(Instr); + unsigned InterleaveFactor = Group->getFactor(); + assert(InterleaveFactor <= 8 && + "Unsupported deinterleave/interleave factor for scalable vectors"); + ElementCount WideVF = State.VF * InterleaveFactor; + auto *VecTy = VectorType::get(ScalarTy, WideVF); + + VPValue *Addr = getAddr(); + Value *ResAddr = State.get(Addr, VPLane(0)); + Value *EVL = State.get(getEVL(), VPLane(0)); + Value *InterleaveEVL = State.Builder.CreateMul( + EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl", + /* NUW= */ true, /* NSW= */ true); + LLVMContext &Ctx = State.Builder.getContext(); + + Value *GroupMask = nullptr; + if (VPValue *BlockInMask = getMask()) { + SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask)); + GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask"); + } else { + GroupMask = + State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue()); + } + + // Vectorize the interleaved load group. + if (isa<LoadInst>(Instr)) { + CallInst *NewLoad = State.Builder.CreateIntrinsic( + VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr, + "wide.vp.load"); + NewLoad->addParamAttr(0, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewLoad); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewLoad); + + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + NewLoad = State.Builder.CreateIntrinsic( + Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor), + NewLoad->getType(), NewLoad, + /*FMFSource=*/nullptr, "strided.vec"); + + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) + continue; + + Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I); + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); + StridedVec = + createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL); + } + + State.set(getVPValue(J), StridedVec); + ++J; + } + return; + } // End for interleaved load. + + // The sub vector type for current instruction. + auto *SubVT = VectorType::get(ScalarTy, State.VF); + // Vectorize the interleaved store group. + ArrayRef<VPValue *> StoredValues = getStoredValues(); + // Collect the stored vector from each member. + SmallVector<Value *, 4> StoredVecs; + const DataLayout &DL = Instr->getDataLayout(); + for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { + Instruction *Member = Group->getMember(I); + // Skip the gaps in the group. + if (!Member) { + StoredVecs.push_back(PoisonValue::get(SubVT)); + continue; + } + + Value *StoredVec = State.get(StoredValues[StoredIdx]); + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL); + + StoredVecs.push_back(StoredVec); + ++StoredIdx; + } + + // Interleave all the smaller vectors into one wider vector. + Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec"); + CallInst *NewStore = + State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store, + {IVec, ResAddr, GroupMask, InterleaveEVL}); + NewStore->addParamAttr(1, + Attribute::getWithAlignment(Ctx, Group->getAlign())); + + applyMetadata(*NewStore); + // TODO: Also manage existing metadata using VPIRMetadata. + Group->addMetadata(NewStore); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + const InterleaveGroup<Instruction> *IG = getInterleaveGroup(); + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + IG->getInsertPos()->printAsOperand(O, false); + O << ", "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", "; + getEVL()->printAsOperand(O, SlotTracker); + if (VPValue *Mask = getMask()) { + O << ", "; + Mask->printAsOperand(O, SlotTracker); + } + + unsigned OpIdx = 0; + for (unsigned i = 0; i < IG->getFactor(); ++i) { + if (!IG->getMember(i)) + continue; + if (getNumStoreOperands() > 0) { + O << "\n" << Indent << " vp.store "; + getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); + O << " to index " << i; + } else { + O << "\n" << Indent << " "; + getVPValue(OpIdx)->printAsOperand(O, SlotTracker); + O << " = vp.load from index " << i; + } + ++OpIdx; + } +} +#endif + +InstructionCost VPInterleaveBase::computeCost(ElementCount VF, + VPCostContext &Ctx) const { Instruction *InsertPos = getInsertPos(); // Find the VPValue index of the interleave group. We need to skip gaps. unsigned InsertPosIdx = 0; |
