diff options
| author | Guy David <guyda96@gmail.com> | 2025-10-08 16:21:49 +0300 |
|---|---|---|
| committer | Guy David <guyda@apple.com> | 2025-11-02 02:10:00 +0200 |
| commit | 502edc8a10441495fd0ed48df0b5b88431873ada (patch) | |
| tree | 58d5d6731e04e901b812cbe3fd7247794cbe3aa0 | |
| parent | 8f7efa094e9ca18f714094eaefb011442b124ec3 (diff) | |
[AArch64] Optimize extending loads of small vectors
users/guy-david/aarch64-ext-load-small-vector-v2
Reduces the total number of loads and the number of moves between SIMD
registers and general-purpose registers.
31 files changed, 706 insertions, 389 deletions
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 78f63b4406eb..3c43b8576e06 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2386,6 +2386,18 @@ public: return true; } + /// Returns the preferred extension type for a promoted load. + /// This is called during type legalization when promoting loads from + /// illegal types (like v4i8) to legal types (like v4i16). + /// By default returns EXTLOAD (anyext), but targets can override to + /// prefer ZEXTLOAD or SEXTLOAD for specific loads. + /// The LoadSDNode parameter allows the target to check alignment and + /// other properties of the specific load being promoted. + virtual ISD::LoadExtType getPreferredExtendForPromotedLoad(LoadSDNode *N, + EVT LoadVT) const { + return ISD::EXTLOAD; + } + /// Returns how the given (atomic) load should be expanded by the /// IR-level AtomicExpand pass. virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index b1776eaae6e8..9aea8b7b3681 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -993,8 +993,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) { assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - ISD::LoadExtType ExtType = - ISD::isNON_EXTLoad(N) ? 
ISD::EXTLOAD : N->getExtensionType(); + ISD::LoadExtType ExtType; + if (ISD::isNON_EXTLoad(N)) { + // For non-extending loads, ask the target what extension type it prefers + ExtType = TLI.getPreferredExtendForPromotedLoad(N, NVT); + } else { + ExtType = N->getExtensionType(); + } SDLoc dl(N); SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(), N->getMemoryVT(), N->getMemOperand()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b2..0e2792d519e9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1438,12 +1438,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + 
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -6746,8 +6754,19 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT ExtVT = ExtVal.getValueType(); - if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors()) - return false; + if (!ExtVT.isScalableVector()) { + if (auto *SrcVal = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) { + EVT SrcVT = SrcVal->getValueType(0); + if ((SrcVT == MVT::v2i8 || SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16) && + isTypeLegal(ExtVT) && + allowsMisalignedMemoryAccesses( + SrcVT, SrcVal->getAddressSpace(), SrcVal->getAlign(), + SrcVal->getMemOperand()->getFlags(), nullptr)) + return true; + } + if (!Subtarget->useSVEForFixedLengthVectors()) + return false; + } // It may be worth creating extending masked loads if there are multiple // masked loads using the same predicate. That way we'll end up creating @@ -7228,37 +7247,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, return DAG.getMergeValues({Loaded, Chain}, DL); } - // Custom lowering for extending v4i8 vector loads. - EVT VT = Op->getValueType(0); - assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); - - if (LoadNode->getMemoryVT() != MVT::v4i8) - return SDValue(); - - // Avoid generating unaligned loads. 
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) - return SDValue(); - - unsigned ExtType; - if (LoadNode->getExtensionType() == ISD::SEXTLOAD) - ExtType = ISD::SIGN_EXTEND; - else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || - LoadNode->getExtensionType() == ISD::EXTLOAD) - ExtType = ISD::ZERO_EXTEND; - else - return SDValue(); - - SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), MachinePointerInfo()); - SDValue Chain = Load.getValue(1); - SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); - SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); - SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); - Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, - DAG.getConstant(0, DL, MVT::i64)); - if (VT == MVT::v4i32) - Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); - return DAG.getMergeValues({Ext, Chain}, DL); + return SDValue(); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, @@ -12676,6 +12665,55 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return IsLegal; } +ISD::LoadExtType +AArch64TargetLowering::getPreferredExtendForPromotedLoad(LoadSDNode *N, + EVT LoadVT) const { + // Only prefer ZEXTLOAD for small integer vector types that will be + // optimized by performSmallVectorLoadExtCombine. We need to match the + // same conditions that function checks to avoid applying ZEXTLOAD when + // the load won't actually be optimized. 
+ + EVT MemVT = N->getMemoryVT(); + + // performSmallVectorLoadExtCombine only handles specific types + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && + MemVT != MVT::v2i16 && MemVT != MVT::v4i16) { + return ISD::EXTLOAD; + } + + // performSmallVectorLoadExtCombine requires NEON + if (!Subtarget->isNeonAvailable()) { + return ISD::EXTLOAD; + } + + // performSmallVectorLoadExtCombine bails out on volatile loads + if (N->isVolatile()) { + return ISD::EXTLOAD; + } + + // Check alignment - performSmallVectorLoadExtCombine requires proper alignment + // when strict alignment is required + Align Alignment = N->getAlign(); + Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue()); + if (Subtarget->requiresStrictAlign() && Alignment < RequiredAlignment) { + // The load won't be optimized by performSmallVectorLoadExtCombine, + // so don't use ZEXTLOAD + return ISD::EXTLOAD; + } + + // For these small integer vector types with proper alignment, + // prefer zero-extending loads to avoid the need for AND masks later. + // This is especially beneficial for the patterns created by + // performSmallVectorLoadExtCombine which converts these to scalar loads + // followed by vector operations. + if (isLoadExtLegal(ISD::ZEXTLOAD, LoadVT, MemVT)) { + return ISD::ZEXTLOAD; + } + + // Default to EXTLOAD for other cases + return ISD::EXTLOAD; +} + //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// @@ -23300,6 +23338,137 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT)); } +// Helper function to optimize small vector load + extension patterns. +// These patterns would otherwise be scalarized into inefficient sequences. 
+static SDValue performSmallVectorLoadExtCombine(LoadSDNode *LD, + SelectionDAG &DAG) { + // Don't optimize if NEON is not available. + const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); + if (!Subtarget.isNeonAvailable()) + return SDValue(); + + // Don't optimize volatile loads + if (LD->isVolatile()) + return SDValue(); + + EVT MemVT = LD->getMemoryVT(); + EVT ResVT = LD->getValueType(0); + + // Only handle our specific small vector patterns. + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16 && + MemVT != MVT::v4i16) + return SDValue(); + + unsigned NumElts = ResVT.getVectorNumElements(); + unsigned DstEltBits = ResVT.getScalarSizeInBits(); + + // Check alignment + Align Alignment = LD->getAlign(); + Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue()); + if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment) + return SDValue(); + + unsigned ExtOpcode; + switch (LD->getExtensionType()) { + case ISD::EXTLOAD: + case ISD::ZEXTLOAD: + ExtOpcode = ISD::ZERO_EXTEND; + break; + case ISD::SEXTLOAD: + ExtOpcode = ISD::SIGN_EXTEND; + break; + case ISD::NON_EXTLOAD: + return SDValue(); + } + + SDLoc DL(LD); + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + const MachinePointerInfo &PtrInfo = LD->getPointerInfo(); + + SDValue Load; + SDValue Vec; + + if (MemVT == MVT::v2i8) { + Load = DAG.getLoad(MVT::f16, DL, Chain, BasePtr, PtrInfo, Alignment); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Load); + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4f16, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); + if (DstEltBits >= 16) { + Vec = DAG.getNode(ExtOpcode, DL, MVT::v8i16, Vec); + if (DstEltBits >= 32) { + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec); + if (DstEltBits >= 64) { + Vec = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec); + } + } + } + } else if (MemVT == MVT::v4i8) { + Load = DAG.getLoad(MVT::f32, DL, Chain, BasePtr, PtrInfo, Alignment); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Load); + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); + if (DstEltBits >= 16) { + Vec = DAG.getNode(ExtOpcode, DL, MVT::v8i16, Vec); + if (DstEltBits >= 32) { + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec); + if (DstEltBits >= 64) { + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec); + } + } + } + } else if (MemVT == MVT::v2i16) { + Load = DAG.getLoad(MVT::f32, DL, Chain, BasePtr, PtrInfo, Alignment); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Load); + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v4i16, Vec); + if (DstEltBits >= 32) { + Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec); + if (DstEltBits >= 64) { + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec); + } + } + } else if (MemVT == MVT::v4i16) { + Load = DAG.getLoad(MVT::f64, DL, Chain, BasePtr, PtrInfo, Alignment); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, Load); + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1f64, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v4i16, Vec); + if (DstEltBits >= 32) { + Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec); + if (DstEltBits >= 64) { + Vec = 
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec, + DAG.getConstant(0, DL, MVT::i64)); + Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec); + } + } + } + + if (Vec.getValueType().getVectorNumElements() != NumElts) { + EVT FinalVT = EVT::getVectorVT( + *DAG.getContext(), Vec.getValueType().getVectorElementType(), NumElts); + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Vec, + DAG.getConstant(0, DL, MVT::i64)); + } + + + + return DAG.getMergeValues({Vec, Load.getValue(1)}, DL); +} + static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -24426,6 +24595,10 @@ static SDValue performLOADCombine(SDNode *N, } } + // Try to optimize small vector load + extension patterns + if (SDValue Result = performSmallVectorLoadExtCombine(LD, DAG)) + return Result; + if (LD->isVolatile() || !Subtarget->isLittleEndian()) return SDValue(N, 0); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252..90871f86c4b9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -145,6 +145,10 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; + /// Return the preferred extension type for promoted loads. + ISD::LoadExtType getPreferredExtendForPromotedLoad(LoadSDNode *N, + EVT LoadVT) const override; + /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. 
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override; diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll index 317feb5ad9ad..bc0edc9b5eca 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) { define <2 x i16> @test1(ptr %v2i16_ptr) { ; CHECK-LE-LABEL: test1: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #2 -; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test1: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define 
<4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,219 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; 
CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr 
%a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i16> @sext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @sext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; 
CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; 
CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 0cd885e59981..2cd54d411354 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -222,21 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0, #2] -; CHECK-NEON-NEXT: ldr h0, [x0] +; CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: mov v0.d[1], x8 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0, #2] -; CHECK-SVE-NEXT: ldr h0, [x0] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: mov v0.d[1], x8 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 96168cb80196..7502db4c5aa9 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index a7875dbebd0e..d8d003c85eed 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -176,12 +176,12 @@ entry: define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: 
ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -212,12 +212,12 @@ entry: define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -248,12 +248,12 @@ entry: define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: 
ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -698,12 +695,10 @@ entry: define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -734,12 +729,10 @@ entry: define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; 
CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -770,12 +763,10 @@ entry: define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll index 8ec5434085d6..bc95111ec427 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll @@ -103,12 +103,12 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_zero_shift_amount define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind { -; CHECK-LABEL: sqshl8b: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqshl v0.8b, v0.8b, v1.8b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sqshl8b: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: sqshl v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll index 709a17e32f58..309f31d77ee6 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll +++ 
b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -509,7 +509,7 @@ define <4 x i8> @small_vector(<4 x i8> %0) { ; CHECK-NEXT: .seh_add_fp 176 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: str w0, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: blr x9 diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll index 741dcf3ad4c2..8ba462daf14b 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll @@ -13,20 +13,6 @@ define <4 x i16> @z_i32_v4i16(i32 %x) { ; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: z_i32_v4i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmov s0, w0 -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] -; CHECK-GI-NEXT: mov b3, v0.b[2] -; CHECK-GI-NEXT: mov b0, v0.b[3] -; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] -; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] -; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] -; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret %b = bitcast i32 %x to <4 x i8> %e = zext <4 x i8> %b to <4 x i16> ret <4 x i16> %e diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 20f19fddf790..002e6cd509be 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: add x8, sp, #12 ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; 
CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 04124609eec7..b1b869ec9e1f 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -6,11 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] +; CHECK-SD-NEXT: ldr h1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #24 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] @@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -145,11 +143,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] +; CHECK-SD-NEXT: ldr s1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #16 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index d547b6bec5b8..9c59f1b233b5 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -6,10 +6,9 @@ define void @v2i8(ptr %p1) { ; 
CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h @@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -143,10 +141,8 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index fc9bf2c0aca6..c9181b4c312d 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -6,10 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ldr h0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b @@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) { ; 
CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -219,10 +218,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e4..d18cff51c610 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, 
sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x0, #4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c4bb6e37d6ea..b138fa408542 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9c69a6f03b85..475bd22c6ebc 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -68,13 +68,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b 
}[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -240,13 +237,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 1c4a504d0ab7..9e321bbecb80 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-SD-NEXT: sqadd v0.4h, v0.4h, v1.4h @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll index 3e708b0678fb..297b25ed075e 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: 
.LBB3_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #1 -; CHECK-NEXT: ldrsb w10, [x9] -; CHECK-NEXT: ldrsb w9, [x9, #1] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, x8, lsl #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: str q0, [x1, x8, lsl #4] diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 3af858713525..a30e9045c6a2 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-SD-NEXT: sqsub v0.4h, v0.4h, v1.4h @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr 
%px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 3a9f12b83870..1dc55fccc3da 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: strb w2, [x3, #2] ; CHECK-SD-NEXT: mov v0.h[1], w1 ; CHECK-SD-NEXT: mov v0.h[2], w2 ; CHECK-SD-NEXT: xtn v0.8b, v0.8h -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] -; CHECK-SD-NEXT: strb w2, [x3, #2] -; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: str h0, [x3] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 5e278d59b659..dd920b98e18e 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: 
ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 6fd5b820a224..b457e0307fbe 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; 
CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 @@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b32243..4fb3bf7392d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 @@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, 
v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 3cfb24aaccb1..cd02d18e6164 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll 
b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a71cf95a728d..ef70137e6dee 100644 --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll index f6553b6acec9..6d4061fb02cf 100644 --- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll +++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc 
-mtriple=aarch64 %s -o - | FileCheck %s -; CHECK-LABEL: convert_v3f32 -; CHECK: strb -; CHECK: strh define void @convert_v3f32() { +; CHECK-LABEL: convert_v3f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: strb wzr, [x8] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: br label %bb diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 80029fb71757..ee74984125f7 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-SD-NEXT: shl.16b v0, v0, #7 ; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE ; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; CHECK-SD-NEXT: add x8, sp, #14 ; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 ; CHECK-SD-NEXT: and.16b v0, v0, v1 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 ; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: addv.8h h0, v0 -; CHECK-SD-NEXT: str h0, [sp, #14] -; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x1 -; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ushll.8h v0, v0, #0 +; CHECK-SD-NEXT: ushll.4s v0, v0, #0 ; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 7d3f5bc270d6..a5a26c185fdb 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -338,7 +338,7 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) { ; BE-NEXT: add x8, x0, #2 ; BE-NEXT: ldr s0, [sp, #12] ; BE-NEXT: 
rev32 v0.8b, v0.8b -; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: shl v0.4s, v0.4s, #24 @@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldr s0, [x0] ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: strb w8, [x1, #2] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: str s0, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: str h0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -459,7 +459,7 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: add x8, x0, #2 ; BE-NEXT: ldr s0, [sp, #12] ; BE-NEXT: rev32 v0.8b, v0.8b -; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: bic v0.4h, #255, lsl #8 ; BE-NEXT: rev32 v1.8h, v0.8h @@ -562,7 +562,7 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; BE-NEXT: add x8, x0, #2 ; BE-NEXT: ldr s0, [sp, #12] ; BE-NEXT: rev32 v0.8b, v0.8b -; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: adrp x8, .LCPI15_0 ; BE-NEXT: add x8, x8, :lo12:.LCPI15_0 @@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 
v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #3] -; BE-NEXT: sturh w8, [x1, #1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #5] -; BE-NEXT: sturh w8, [x1, #3] +; 
BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -764,10 +764,9 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: mov h0, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: ushll.4s v1, v1, #0 ; CHECK-NEXT: stur b0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] +; CHECK-NEXT: str h1, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 diff --git 
a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a..7cba0d608cd4 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1012,18 +1012,16 @@ define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) { ; ; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: adrp x8, .LCPI11_0 -; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI11_0 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] ; CHECK-BE-NEXT: mov x8, xzr ; CHECK-BE-NEXT: .LBB11_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldr s1, [x0, x8] +; CHECK-BE-NEXT: ldr s0, [x0, x8] ; CHECK-BE-NEXT: add x8, x8, #16 ; CHECK-BE-NEXT: cmp x8, #128 -; CHECK-BE-NEXT: rev32 v1.16b, v1.16b -; CHECK-BE-NEXT: tbl v1.16b, { v1.16b }, v0.16b -; CHECK-BE-NEXT: st1 { v1.16b }, [x1] +; CHECK-BE-NEXT: rev32 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: st1 { v0.4s }, [x1] ; CHECK-BE-NEXT: add x1, x1, #64 ; CHECK-BE-NEXT: b.ne .LBB11_1 ; CHECK-BE-NEXT: // %bb.2: // %exit |
