diff options
Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 202 |
1 files changed, 111 insertions, 91 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0a23bf251676..bb0270c018c9 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -176,6 +176,27 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { return 8; } +bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const { + if (!ST->hasCF()) + return false; + if (!Ty) + return true; + // Conditional faulting is supported by CFCMOV, which only accepts + // 16/32/64-bit operands. + // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's + // profitable. + if (!Ty->isIntegerTy()) + return false; + switch (cast<IntegerType>(Ty)->getBitWidth()) { + default: + return false; + case 16: + case 32: + case 64: + return true; + } +} + TypeSize X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { unsigned PreferVectorWidth = ST->getPreferVectorWidth(); @@ -851,7 +872,9 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw - { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } }, + { ISD::MUL, MVT::v16i8, { 4, 12, 4, 5 } }, // extend/pmullw/trunc + { ISD::MUL, MVT::v32i8, { 3, 10, 7,10 } }, // pmaddubsw + { ISD::MUL, MVT::v64i8, { 3, 11, 7,10 } }, // pmaddubsw { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb @@ -1117,7 +1140,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack - { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw + { ISD::MUL, MVT::v32i8, { 4, 8, 8,16 } }, // pmaddubsw { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld @@ -1168,7 +1191,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. - { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split + { ISD::MUL, MVT::v32i8, { 10, 11, 18, 19 } }, // pmaddubsw + split + { ISD::MUL, MVT::v16i8, { 5, 6, 8, 12 } }, // 2*pmaddubsw/3*and/psllw/or { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld @@ -1308,7 +1332,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. - { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) }; @@ -1317,6 +1340,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( if (auto KindCost = Entry->Cost[CostKind]) return LT.first * *KindCost; + static const CostKindTblEntry SSSE3CostTable[] = { + { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or + }; + + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second)) + if (auto KindCost = Entry->Cost[CostKind]) + return LT.first * *KindCost; + static const CostKindTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. @@ -1353,7 +1385,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq - { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack + { ISD::MUL, MVT::v16i8, { 6, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add @@ -4061,7 +4093,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) }; static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets - { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV + { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV @@ -4082,9 +4114,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto }; static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets - { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV - { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA + { ISD::ABS, MVT::i32, { 1, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i16, { 2, 2, 3, 3 } }, // SUB+XOR+SRA or SUB+CMOV + { ISD::ABS, MVT::i8, { 2, 4, 4, 3 } }, // SUB+XOR+SRA { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } }, { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } }, @@ -4259,6 +4291,37 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } if (ISD != ISD::DELETED_NODE) { + auto adjustTableCost = [&](int ISD, unsigned Cost, + std::pair<InstructionCost, MVT> LT, + FastMathFlags FMF) -> InstructionCost { + InstructionCost LegalizationCost = LT.first; + MVT MTy = LT.second; + + // If there are no NANs to deal with, then these are reduced to a + // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we + // assume is used in the non-fast case. + if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { + if (FMF.noNaNs()) + return LegalizationCost * 1; + } + + // For cases where some ops can be folded into a load/store, assume free. + if (MTy.isScalarInteger()) { + if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa<StoreInst>(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + } + + return LegalizationCost * (int)Cost; + }; + // Legalize the type. std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; @@ -4277,180 +4340,132 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) return LT.first; - auto adjustTableCost = [](int ISD, unsigned Cost, - InstructionCost LegalizationCost, - FastMathFlags FMF) { - // If there are no NANs to deal with, then these are reduced to a - // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we - // assume is used in the non-fast case. - if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { - if (FMF.noNaNs()) - return LegalizationCost * 1; - } - return LegalizationCost * (int)Cost; - }; - if (ST->useGLMDivSqrtCosts()) if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->useSLMArithCosts()) if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVBMI2()) if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBITALG()) if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasVPOPCNTDQ()) if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasGFNI()) if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBWI()) if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasXOP()) if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE42()) if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSSE3()) if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasSSE1()) if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (ST->hasBMI()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->hasLZCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->hasPOPCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); - } - - if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { - if (const Instruction *II = ICA.getInst()) { - if (II->hasOneUse() && isa<StoreInst>(II->user_back())) - return TTI::TCC_Free; - if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { - if (LI->hasOneUse()) - return TTI::TCC_Free; - } - } + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, - ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) - return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); + return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); } return BaseT::getIntrinsicInstrCost(ICA, CostKind); @@ -5076,7 +5091,12 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); auto VT = TLI->getValueType(DL, SrcVTy); InstructionCost Cost = 0; - if (VT.isSimple() && LT.second != VT.getSimpleVT() && + MVT Ty = LT.second; + if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64) + // APX masked load/store for scalar is cheap. + return Cost + LT.first; + + if (VT.isSimple() && Ty != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires extend/truncate for data and a shuffle for mask. Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, @@ -5084,9 +5104,9 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, CostKind, 0, nullptr); - else if (LT.first * LT.second.getVectorNumElements() > NumElem) { + else if (LT.first * Ty.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), - LT.second.getVectorNumElements()); + Ty.getVectorNumElements()); // Expanding requires fill mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, CostKind, 0, MaskTy); @@ -5905,14 +5925,14 @@ bool X86TTIImpl::canMacroFuseCmp() { } bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { - if (!ST->hasAVX()) - return false; + Type *ScalarTy = DataTy->getScalarType(); - // The backend can't handle a single element vector. - if (isa<VectorType>(DataTy) && - cast<FixedVectorType>(DataTy)->getNumElements() == 1) + // The backend can't handle a single element vector w/o CFCMOV. + if (isa<VectorType>(DataTy) && cast<FixedVectorType>(DataTy)->getNumElements() == 1) + return ST->hasCF() && hasConditionalLoadStoreForType(ScalarTy); + + if (!ST->hasAVX()) return false; - Type *ScalarTy = DataTy->getScalarType(); if (ScalarTy->isPointerTy()) return true; |
