diff options
Diffstat (limited to 'llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp | 125 |
1 files changed, 109 insertions, 16 deletions
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 08fb7586d215..0eefd3e2b350 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( CostKind); } - int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (ISD != ISD::LOAD) { - return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, - CostKind); - } - EVT VT = TLI->getValueType(DL, Ty, true); // Type legalization can't handle structs if (VT == MVT::Other) @@ -182,22 +176,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( if (!LT.first.isValid()) return InstructionCost::getInvalid(); - // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can - // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads - // are twice as expensive as scalar. + int ISD = TLI->InstructionOpcodeToISD(Opcode); unsigned width = VT.getSizeInBits(); - switch (width) { - default: - break; - case 32: - case 64: - case 128: - return 2; + if (ISD == ISD::LOAD) { + // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads + // can be lowered to load32_zero and load64_zero respectively. Assume SIMD + // loads are twice as expensive as scalar. + switch (width) { + default: + break; + case 32: + case 64: + case 128: + return 2; + } + } else if (ISD == ISD::STORE) { + // For stores, we can use store lane operations. + switch (width) { + default: + break; + case 8: + case 16: + case 32: + case 64: + case 128: + return 2; + } } return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); } +InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) const { + assert(Factor >= 2 && "Invalid interleave factor"); + + auto *VecTy = cast<VectorType>(Ty); + if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) { + return InstructionCost::getInvalid(); + } + + if (UseMaskForCond || UseMaskForGaps) + return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices, + Alignment, AddressSpace, CostKind, + UseMaskForCond, UseMaskForGaps); + + constexpr unsigned MaxInterleaveFactor = 4; + if (Factor <= MaxInterleaveFactor) { + unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); + // Ensure the number of vector elements is greater than 1. + if (MinElts < 2 || MinElts % Factor != 0) + return InstructionCost::getInvalid(); + + unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); + // Ensure the element type is legal. + if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) + return InstructionCost::getInvalid(); + + auto *SubVecTy = + VectorType::get(VecTy->getElementType(), + VecTy->getElementCount().divideCoefficientBy(Factor)); + InstructionCost MemCost = + getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind); + + unsigned VecSize = DL.getTypeSizeInBits(SubVecTy); + unsigned MaxVecSize = 128; + unsigned NumAccesses = + std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize); + + // A stride of two is commonly supported via dedicated instructions, so it + // should be relatively cheap for all element sizes. A stride of four is + // more expensive as it will likely require more shuffles. Using two + // simd128 inputs is considered more expensive and we mainly account for + // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable + // arithmetic kernels. + static const CostTblEntry ShuffleCostTbl[] = { + // One reg. + {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 + {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 + {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 + {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 + {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 + {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 + + // Two regs. + {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8 + {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16 + {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 + + // One reg. + {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 + {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 + {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16 + + // Two regs. + {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8 + {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16 + {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32 + + // Four regs. + {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32 + }; + + EVT ETy = TLI->getValueType(DL, SubVecTy); + if (const auto *Entry = + CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT())) + return Entry->Cost + (NumAccesses * MemCost); + } + + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, CostKind, + UseMaskForCond, UseMaskForGaps); +} + InstructionCost WebAssemblyTTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const { |
