summaryrefslogtreecommitdiff
path: root/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp')
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp410
1 files changed, 362 insertions, 48 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c4fdcccc6d62..bf5148954309 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -52,8 +52,9 @@ bool VPRecipeBase::mayWriteToMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
- return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
+ return cast<VPInterleaveBase>(this)->getNumStoreOperands() > 0;
case VPWidenStoreEVLSC:
case VPWidenStoreSC:
return true;
@@ -142,6 +143,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return false;
}
default:
+ // FIXME: Return false if the recipe represents an interleaved store.
return true;
}
}
@@ -183,6 +185,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
+ case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenLoadEVLSC:
@@ -255,7 +258,7 @@ InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
Instruction *UI = nullptr;
if (auto *S = dyn_cast<VPSingleDefRecipe>(this))
UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
- else if (auto *IG = dyn_cast<VPInterleaveRecipe>(this))
+ else if (auto *IG = dyn_cast<VPInterleaveBase>(this))
UI = IG->getInsertPos();
else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(this))
UI = &WidenMem->getIngredient();
@@ -389,6 +392,42 @@ void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPIRFlags::intersectFlags(const VPIRFlags &Other) { // Narrow this object's flags to those also set on Other.
+ assert(OpType == Other.OpType && "OpType must match"); // Both sides must describe the same kind of operation.
+ switch (OpType) { // OpType selects which flag group is narrowed below.
+ case OperationType::OverflowingBinOp:
+ WrapFlags.HasNUW &= Other.WrapFlags.HasNUW; // A flag survives only if it is set on both sides.
+ WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
+ break;
+ case OperationType::Trunc:
+ TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
+ TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
+ break;
+ case OperationType::DisjointOp:
+ DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
+ break;
+ case OperationType::PossiblyExactOp:
+ ExactFlags.IsExact &= Other.ExactFlags.IsExact;
+ break;
+ case OperationType::GEPOp:
+ GEPFlags &= Other.GEPFlags; // Relies on operator&= of the GEPFlags type.
+ break;
+ case OperationType::FPMathOp: // NOTE(review): only NoNaNs and NoInfs are narrowed; any other FMFs members keep their value - confirm intended.
+ FMFs.NoNaNs &= Other.FMFs.NoNaNs;
+ FMFs.NoInfs &= Other.FMFs.NoInfs;
+ break;
+ case OperationType::NonNegOp:
+ NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
+ break;
+ case OperationType::Cmp: // Nothing is modified; the predicates are only checked to be equal.
+ assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
+ break;
+ case OperationType::Other: // Nothing is modified; AllFlags is only checked to be equal.
+ assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
+ break;
+ }
+}
+
FastMathFlags VPIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -471,7 +510,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::ICmp:
case Instruction::FCmp:
case Instruction::Store:
- case VPInstruction::ActiveLaneMask:
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +519,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::WideIVStep:
return 2;
case Instruction::Select:
+ case VPInstruction::ActiveLaneMask:
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ReductionStartVector:
return 3;
@@ -620,7 +659,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
Name);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
+ auto PredTy = VectorType::get(
+ Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+ ->getZExtValue());
return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
@@ -875,9 +916,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::AnyOf: {
- Value *Res = State.get(getOperand(0));
+ Value *Res = Builder.CreateFreeze(State.get(getOperand(0)));
for (VPValue *Op : drop_begin(operands()))
- Res = Builder.CreateOr(Res, State.get(Op));
+ Res = Builder.CreateOr(Res, Builder.CreateFreeze(State.get(Op)));
return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
}
case VPInstruction::ExtractLane: {
@@ -919,8 +960,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
unsigned LastOpIdx = getNumOperands() - 1;
Value *Res = nullptr;
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
- Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
- Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
+ Value *TrailingZeros =
+ State.VF.isScalar()
+ ? Builder.CreateZExt(
+ Builder.CreateICmpEQ(State.get(getOperand(Idx)),
+ Builder.getFalse()),
+ Builder.getInt64Ty())
+ : Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
+ State.get(getOperand(Idx)),
+ true, Name);
Value *Current = Builder.CreateAdd(
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
if (Res) {
@@ -1027,8 +1075,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
switch (getOpcode()) {
+ case Instruction::Select: {
+ // TODO: It may be possible to improve this by analyzing where the
+ // condition operand comes from.
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ auto *CondTy = Ctx.Types.inferScalarType(getOperand(0));
+ auto *VecTy = Ctx.Types.inferScalarType(getOperand(1));
+ if (!vputils::onlyFirstLaneUsed(this)) {
+ CondTy = toVectorTy(CondTy, VF);
+ VecTy = toVectorTy(VecTy, VF);
+ }
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, Pred,
+ Ctx.CostKind);
+ }
case Instruction::ExtractElement:
case VPInstruction::ExtractLane: {
+ if (VF.isScalar()) {
+ // ExtractLane with VF=1 takes care of handling extracting across multiple
+ // parts.
+ return 0;
+ }
+
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1107,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::FirstActiveLane: {
+ Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
+ if (VF.isScalar())
+ return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy),
+ CmpInst::ICMP_EQ, Ctx.CostKind);
// Calculate the cost of determining the lane index.
- auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
+ auto *PredTy = toVectorTy(ScalarTy, VF);
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
Type::getInt64Ty(Ctx.LLVMCtx),
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
@@ -1060,7 +1132,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
}
case VPInstruction::ActiveLaneMask: {
Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
- Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+ unsigned Multiplier =
+ cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+ Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
{ArgTy, ArgTy});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
@@ -1684,18 +1758,22 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
State.set(this, V);
}
-InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
+static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
+ ArrayRef<const VPValue *> Operands,
+ const VPRecipeWithIRFlags &R,
+ ElementCount VF,
+ VPCostContext &Ctx) {
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
- for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (const auto &[Idx, Op] : enumerate(Operands)) {
auto *V = Op->getUnderlyingValue();
if (!V) {
- if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
+ if (auto *UI = dyn_cast_or_null<CallBase>(R.getUnderlyingValue())) {
Arguments.push_back(UI->getArgOperand(Idx));
continue;
}
@@ -1705,21 +1783,31 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
Arguments.push_back(V);
}
- Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
+ Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
+ Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
SmallVector<Type *> ParamTys;
- for (unsigned I = 0; I != getNumOperands(); ++I)
- ParamTys.push_back(
- toVectorTy(Ctx.Types.inferScalarType(getOperand(I)), VF));
+ for (const VPValue *Op : Operands) {
+ ParamTys.push_back(VF.isVector()
+ ? toVectorTy(Ctx.Types.inferScalarType(Op), VF)
+ : Ctx.Types.inferScalarType(Op));
+ }
// TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
- FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
+ FastMathFlags FMF =
+ R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
IntrinsicCostAttributes CostAttrs(
- VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
- dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()),
+ ID, RetTy, Arguments, ParamTys, FMF,
+ dyn_cast_or_null<IntrinsicInst>(R.getUnderlyingValue()),
InstructionCost::getInvalid(), &Ctx.TLI);
return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
+InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const { // Cost of this recipe at VF; delegates to getCostForIntrinsics.
+ SmallVector<const VPValue *> ArgOps(operands()); // Every operand of the recipe is forwarded as an intrinsic argument.
+ return getCostForIntrinsics(VectorIntrinsicID, ArgOps, *this, VF, Ctx); // *this supplies the underlying value and fast-math flags.
+}
+
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
return Intrinsic::getBaseName(VectorIntrinsicID);
}
@@ -2110,8 +2198,10 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // More complex computation, let the legacy cost-model handle this for now.
- return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
+ // If the div/rem operation isn't safe to speculate and requires
+ // predication, then the only way we can even create a vplan is to insert
+ // a select on the second input operand to ensure we use the value of 1
+ // for the inactive lanes. The select will be costed separately.
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
@@ -2174,7 +2264,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
if (VF.isScalar())
return TTI::CastContextHint::Normal;
- if (isa<VPInterleaveRecipe>(R))
+ if (isa<VPInterleaveBase>(R))
return TTI::CastContextHint::Interleave;
if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(R))
return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
@@ -2756,10 +2846,10 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF));
assert(RedTy->isIntegerTy() &&
"VPExpressionRecipe only supports integer types currently.");
+ unsigned Opcode = RecurrenceDescriptor::getOpcode(
+ cast<VPReductionRecipe>(ExpressionRecipes.back())->getRecurrenceKind());
switch (ExpressionType) {
case ExpressionTypes::ExtendedReduction: {
- unsigned Opcode = RecurrenceDescriptor::getOpcode(
- cast<VPReductionRecipe>(ExpressionRecipes[1])->getRecurrenceKind());
return Ctx.TTI.getExtendedReductionCost(
Opcode,
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
@@ -2767,13 +2857,14 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
RedTy, SrcVecTy, std::nullopt, Ctx.CostKind);
}
case ExpressionTypes::MulAccReduction:
- return Ctx.TTI.getMulAccReductionCost(false, RedTy, SrcVecTy, Ctx.CostKind);
+ return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
+ Ctx.CostKind);
case ExpressionTypes::ExtMulAccReduction:
return Ctx.TTI.getMulAccReductionCost(
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
Instruction::ZExt,
- RedTy, SrcVecTy, Ctx.CostKind);
+ Opcode, RedTy, SrcVecTy, Ctx.CostKind);
}
llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
@@ -3014,23 +3105,75 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// instruction cost.
return 0;
case Instruction::Call: {
- if (!isSingleScalar()) {
- // TODO: Handle remaining call costs here as well.
- if (VF.isScalable())
- return InstructionCost::getInvalid();
- break;
- }
-
auto *CalledFn =
cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue());
- if (CalledFn->isIntrinsic())
- break;
+ SmallVector<const VPValue *> ArgOps(drop_end(operands()));
SmallVector<Type *, 4> Tys;
- for (VPValue *ArgOp : drop_end(operands()))
+ for (const VPValue *ArgOp : ArgOps)
Tys.push_back(Ctx.Types.inferScalarType(ArgOp));
+
+ if (CalledFn->isIntrinsic())
+ // Various pseudo-intrinsics with costs of 0 are scalarized instead of
+ // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
+ switch (CalledFn->getIntrinsicID()) {
+ case Intrinsic::assume:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::sideeffect:
+ case Intrinsic::pseudoprobe:
+ case Intrinsic::experimental_noalias_scope_decl: {
+ assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx) == 0 &&
+ "scalarizing intrinsic should be free");
+ return InstructionCost(0);
+ }
+ default:
+ break;
+ }
+
Type *ResultTy = Ctx.Types.inferScalarType(this);
- return Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ InstructionCost ScalarCallCost =
+ Ctx.TTI.getCallInstrCost(CalledFn, ResultTy, Tys, Ctx.CostKind);
+ if (isSingleScalar()) {
+ if (CalledFn->isIntrinsic())
+ ScalarCallCost = std::min(
+ ScalarCallCost,
+ getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
+ ElementCount::getFixed(1), Ctx));
+ return ScalarCallCost;
+ }
+
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+
+ // Compute the cost of scalarizing the result and operands if needed.
+ InstructionCost ScalarizationCost = 0;
+ if (VF.isVector()) {
+ if (!ResultTy->isVoidTy()) {
+ for (Type *VectorTy :
+ to_vector(getContainedTypes(toVectorizedTy(ResultTy, VF)))) {
+ ScalarizationCost += Ctx.TTI.getScalarizationOverhead(
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert=*/true,
+ /*Extract=*/false, Ctx.CostKind);
+ }
+ }
+ // Skip operands that do not require extraction/scalarization and do not
+ // incur any overhead.
+ SmallPtrSet<const VPValue *, 4> UniqueOperands;
+ Tys.clear();
+ for (auto *Op : ArgOps) {
+ if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ !UniqueOperands.insert(Op).second)
+ continue;
+ Tys.push_back(toVectorizedTy(Ctx.Types.inferScalarType(Op), VF));
+ }
+ ScalarizationCost +=
+ Ctx.TTI.getOperandsScalarizationOverhead(Tys, Ctx.CostKind);
+ }
+
+ return ScalarCallCost * VF.getFixedValue() + ScalarizationCost;
}
case Instruction::Add:
case Instruction::Sub:
@@ -3045,10 +3188,29 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
return *getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
(isSingleScalar() ? 1 : VF.getFixedValue());
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (isSingleScalar()) {
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
+ return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
+ }
+ // TODO: See getMemInstScalarizationCost for how to handle replicating and
+ // predicated cases.
+ break;
}
}
@@ -3181,10 +3343,17 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
// TODO: Using the original IR may not be accurate.
// Currently, ARM will use the underlying IR to calculate gather/scatter
// instruction cost.
- const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- Type *PtrTy = toVectorTy(Ptr->getType(), VF);
assert(!Reverse &&
"Inconsecutive memory access should not have the order.");
+
+ const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
+ Type *PtrTy = Ptr->getType();
+
+ // If the address value is uniform across all lanes, then the address can be
+ // calculated with scalar type and broadcast.
+ if (!vputils::isSingleScalar(getAddr()))
+ PtrTy = toVectorTy(PtrTy, VF);
+
return Ctx.TTI.getAddressComputationCost(PtrTy, nullptr, nullptr,
Ctx.CostKind) +
Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment,
@@ -3532,9 +3701,9 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Lane && "Interleave group being replicated.");
- assert((!NeedsMaskForGaps || !State.VF.isScalable()) &&
+ assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
"Masking gaps for scalable vectors is not yet supported.");
- const InterleaveGroup<Instruction> *Group = IG;
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
Instruction *Instr = Group->getInsertPos();
// Prepare for the vector type of the interleaved load/store.
@@ -3574,7 +3743,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
- if (NeedsMaskForGaps) {
+ if (needsMaskForGaps()) {
MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
@@ -3651,7 +3820,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
// Vectorize the interleaved store group.
Value *MaskForGaps =
createBitMaskForGaps(State.Builder, State.VF.getKnownMinValue(), *Group);
- assert(((MaskForGaps != nullptr) == NeedsMaskForGaps) &&
+ assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
"Mismatch between NeedsMaskForGaps and MaskForGaps");
ArrayRef<VPValue *> StoredValues = getStoredValues();
// Collect the stored vector from each member.
@@ -3702,6 +3871,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
@@ -3730,8 +3900,152 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
- VPCostContext &Ctx) const {
+void VPInterleaveEVLRecipe::execute(VPTransformState &State) { // Emit one wide vp.load or vp.store for the whole interleave group.
+ assert(!State.Lane && "Interleave group being replicated.");
+ assert(State.VF.isScalable() &&
+ "Only support scalable VF for EVL tail-folding.");
+ assert(!needsMaskForGaps() &&
+ "Masking gaps for scalable vectors is not yet supported.");
+ const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
+ Instruction *Instr = Group->getInsertPos(); // The insert position decides load vs. store and the element type.
+
+ // Prepare for the vector type of the interleaved load/store: VF * factor elements of the insert position's type.
+ Type *ScalarTy = getLoadStoreType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
+ assert(InterleaveFactor <= 8 &&
+ "Unsupported deinterleave/interleave factor for scalable vectors");
+ ElementCount WideVF = State.VF * InterleaveFactor;
+ auto *VecTy = VectorType::get(ScalarTy, WideVF);
+
+ VPValue *Addr = getAddr();
+ Value *ResAddr = State.get(Addr, VPLane(0)); // Scalar address taken from lane 0.
+ Value *EVL = State.get(getEVL(), VPLane(0)); // Scalar explicit vector length taken from lane 0.
+ Value *InterleaveEVL = State.Builder.CreateMul( // EVL scaled by the factor, so it counts elements of the wide vector.
+ EVL, ConstantInt::get(EVL->getType(), InterleaveFactor), "interleave.evl",
+ /* NUW= */ true, /* NSW= */ true);
+ LLVMContext &Ctx = State.Builder.getContext();
+
+ Value *GroupMask = nullptr;
+ if (VPValue *BlockInMask = getMask()) { // With a block mask: interleave InterleaveFactor copies of it.
+ SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
+ GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
+ } else { // Without a mask: use an all-true splat over the wide vector.
+ GroupMask =
+ State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
+ }
+
+ // Vectorize the interleaved load group: one wide vp.load, then deinterleave into one value per member.
+ if (isa<LoadInst>(Instr)) {
+ CallInst *NewLoad = State.Builder.CreateIntrinsic(
+ VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
+ "wide.vp.load");
+ NewLoad->addParamAttr(0, // Argument 0 is the address; attach the group's alignment to it.
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewLoad);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewLoad);
+
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ NewLoad = State.Builder.CreateIntrinsic( // NewLoad is reused: from here on it names the deinterleave result.
+ Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
+ NewLoad->getType(), NewLoad,
+ /*FMFSource=*/nullptr, "strided.vec");
+
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { // I indexes group slots; J counts only the non-gap members.
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
+ Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
+ // If this member has a different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
+ VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
+ StridedVec =
+ createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
+ }
+
+ State.set(getVPValue(J), StridedVec); // Results are defined densely, one VPValue per existing member.
+ ++J;
+ }
+ return;
+ } // End for interleaved load.
+
+ // The sub vector type for current instruction.
+ auto *SubVT = VectorType::get(ScalarTy, State.VF);
+ // Vectorize the interleaved store group: interleave the per-member vectors and emit one wide vp.store.
+ ArrayRef<VPValue *> StoredValues = getStoredValues();
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ const DataLayout &DL = Instr->getDataLayout();
+ for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) { // StoredIdx advances only for non-gap members.
+ Instruction *Member = Group->getMember(I);
+ // Skip the gaps in the group.
+ if (!Member) {
+ StoredVecs.push_back(PoisonValue::get(SubVT)); // A gap still occupies a slot; fill it with poison.
+ continue;
+ }
+
+ Value *StoredVec = State.get(StoredValues[StoredIdx]);
+ // If this member has a different type, cast it to a unified type.
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ ++StoredIdx;
+ }
+
+ // Interleave all the smaller vectors into one wider vector.
+ Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
+ CallInst *NewStore =
+ State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
+ {IVec, ResAddr, GroupMask, InterleaveEVL});
+ NewStore->addParamAttr(1, // Argument 1 is the address (argument 0 is the stored value).
+ Attribute::getWithAlignment(Ctx, Group->getAlign()));
+
+ applyMetadata(*NewStore);
+ // TODO: Also manage existing metadata using VPIRMetadata.
+ Group->addMetadata(NewStore);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPInterleaveEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const { // Print a header line, then one line per non-gap group member.
+ const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
+ O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getEVL()->printAsOperand(O, SlotTracker);
+ if (VPValue *Mask = getMask()) { // The mask is printed only when one is present.
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+
+ unsigned OpIdx = 0; // Counts non-gap members; indexes stored operands or defined values.
+ for (unsigned i = 0; i < IG->getFactor(); ++i) {
+ if (!IG->getMember(i)) // Gaps produce no output line.
+ continue;
+ if (getNumStoreOperands() > 0) { // Store group: print the stored operand for this member.
+ O << "\n" << Indent << " vp.store ";
+ getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker); // Stored values are read starting at operand index 2.
+ O << " to index " << i;
+ } else { // Load group: print the value defined for this member.
+ O << "\n" << Indent << " ";
+ getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
+ O << " = vp.load from index " << i;
+ }
+ ++OpIdx;
+ }
+}
+#endif
+
+InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
+ VPCostContext &Ctx) const {
Instruction *InsertPos = getInsertPos();
// Find the VPValue index of the interleave group. We need to skip gaps.
unsigned InsertPosIdx = 0;