summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp')
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp125
1 files changed, 109 insertions, 16 deletions
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 08fb7586d215..0eefd3e2b350 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -166,12 +166,6 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
CostKind);
}
- int ISD = TLI->InstructionOpcodeToISD(Opcode);
- if (ISD != ISD::LOAD) {
- return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
- CostKind);
- }
-
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
@@ -182,22 +176,121 @@ InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
if (!LT.first.isValid())
return InstructionCost::getInvalid();
- // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads can
- // be lowered to load32_zero and load64_zero respectively. Assume SIMD loads
- // are twice as expensive as scalar.
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
unsigned width = VT.getSizeInBits();
- switch (width) {
- default:
- break;
- case 32:
- case 64:
- case 128:
- return 2;
+ if (ISD == ISD::LOAD) {
+ // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
+ // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
+ // loads are twice as expensive as scalar.
+ switch (width) {
+ default:
+ break;
+ case 32:
+ case 64:
+ case 128:
+ return 2;
+ }
+ } else if (ISD == ISD::STORE) {
+ // For stores, we can use store lane operations.
+ switch (width) {
+ default:
+ break;
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ return 2;
+ }
}
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}
+InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) const {
+ assert(Factor >= 2 && "Invalid interleave factor");
+
+ auto *VecTy = cast<VectorType>(Ty);
+ if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
+ return InstructionCost::getInvalid();
+ }
+
+ if (UseMaskForCond || UseMaskForGaps)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+
+ constexpr unsigned MaxInterleaveFactor = 4;
+ if (Factor <= MaxInterleaveFactor) {
+ unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
+ // Ensure the number of vector elements is greater than 1.
+ if (MinElts < 2 || MinElts % Factor != 0)
+ return InstructionCost::getInvalid();
+
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+ return InstructionCost::getInvalid();
+
+ auto *SubVecTy =
+ VectorType::get(VecTy->getElementType(),
+ VecTy->getElementCount().divideCoefficientBy(Factor));
+ InstructionCost MemCost =
+ getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
+
+ unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
+ unsigned MaxVecSize = 128;
+ unsigned NumAccesses =
+ std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
+
+ // A stride of two is commonly supported via dedicated instructions, so it
+ // should be relatively cheap for all element sizes. A stride of four is
+ // more expensive as it will likely require more shuffles. Using two
+ // simd128 inputs is considered more expensive and we mainly account for
+ // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
+ // arithmetic kernels.
+ static const CostTblEntry ShuffleCostTbl[] = {
+ // One reg.
+ {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
+ {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8
+ {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8
+ {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
+ {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
+ {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
+
+ // Two regs.
+ {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
+ {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
+ {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
+
+ // One reg.
+ {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8
+ {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8
+ {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
+
+ // Two regs.
+ {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
+ {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
+ {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
+
+ // Four regs.
+ {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
+ };
+
+ EVT ETy = TLI->getValueType(DL, SubVecTy);
+ if (const auto *Entry =
+ CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
+ return Entry->Cost + (NumAccesses * MemCost);
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
+}
+
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
const Value *Op0, const Value *Op1) const {