summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Guy David <guyda96@gmail.com> 2025-10-08 16:21:49 +0300
committer Guy David <guyda@apple.com> 2025-11-02 02:10:00 +0200
commit 502edc8a10441495fd0ed48df0b5b88431873ada (patch)
tree 58d5d6731e04e901b812cbe3fd7247794cbe3aa0
parent 8f7efa094e9ca18f714094eaefb011442b124ec3 (diff)
[AArch64] Optimize extending loads of small vectors
branch: users/guy-david/aarch64-ext-load-small-vector-v2
Reduces the total number of loads and the number of moves between SIMD registers and general-purpose registers.
-rw-r--r--llvm/include/llvm/CodeGen/TargetLowering.h12
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp251
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-load-ext.ll270
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-smull.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/add.ll27
-rw-r--r--llvm/test/CodeGen/AArch64/andorxor.ll81
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vshift.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bitcast-extend.ll14
-rw-r--r--llvm/test/CodeGen/AArch64/bitcast.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/ctlz.ll18
-rw-r--r--llvm/test/CodeGen/AArch64/ctpop.ll18
-rw-r--r--llvm/test/CodeGen/AArch64/cttz.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/extbinopload.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/load.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/mul.ll27
-rw-r--r--llvm/test/CodeGen/AArch64/sadd_sat_vec.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/ssub_sat_vec.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/store.ll7
-rw-r--r--llvm/test/CodeGen/AArch64/sub.ll27
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll13
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll13
-rw-r--r--llvm/test/CodeGen/AArch64/uadd_sat_vec.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/usub_sat_vec.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/v3f-to-int.ll15
-rw-r--r--llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll7
-rw-r--r--llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll73
-rw-r--r--llvm/test/CodeGen/AArch64/zext-to-tbl.ll12
31 files changed, 706 insertions, 389 deletions
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 78f63b4406eb..3c43b8576e06 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2386,6 +2386,18 @@ public:
return true;
}
+ /// Returns the preferred extension type for a promoted load.
+ /// This is called during type legalization when promoting loads from
+ /// illegal types (like v4i8) to legal types (like v4i16). It is only
+ /// consulted for loads that are not already extending; an existing
+ /// extension type is kept as-is.
+ /// By default returns EXTLOAD (anyext), but targets can override to
+ /// prefer ZEXTLOAD or SEXTLOAD for specific loads.
+ /// \p N is the specific load being promoted, which allows the target to
+ /// check alignment and other properties of it. \p LoadVT is the promoted
+ /// type that the resulting extending load will produce.
+ virtual ISD::LoadExtType getPreferredExtendForPromotedLoad(LoadSDNode *N,
+ EVT LoadVT) const {
+ return ISD::EXTLOAD;
+ }
+
/// Returns how the given (atomic) load should be expanded by the
/// IR-level AtomicExpand pass.
virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1776eaae6e8..9aea8b7b3681 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -993,8 +993,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- ISD::LoadExtType ExtType =
- ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+ ISD::LoadExtType ExtType;
+ if (ISD::isNON_EXTLoad(N)) {
+ // For non-extending loads, ask the target what extension type it prefers
+ ExtType = TLI.getPreferredExtendForPromotedLoad(N, NVT);
+ } else {
+ ExtType = N->getExtensionType();
+ }
SDLoc dl(N);
SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
N->getMemoryVT(), N->getMemOperand());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b2..0e2792d519e9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1438,12 +1438,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
// ADDP custom lowering
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -6746,8 +6754,19 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT ExtVT = ExtVal.getValueType();
- if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
- return false;
+ if (!ExtVT.isScalableVector()) {
+ if (auto *SrcVal = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
+ EVT SrcVT = SrcVal->getValueType(0);
+ if ((SrcVT == MVT::v2i8 || SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16) &&
+ isTypeLegal(ExtVT) &&
+ allowsMisalignedMemoryAccesses(
+ SrcVT, SrcVal->getAddressSpace(), SrcVal->getAlign(),
+ SrcVal->getMemOperand()->getFlags(), nullptr))
+ return true;
+ }
+ if (!Subtarget->useSVEForFixedLengthVectors())
+ return false;
+ }
// It may be worth creating extending masked loads if there are multiple
// masked loads using the same predicate. That way we'll end up creating
@@ -7228,37 +7247,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
return DAG.getMergeValues({Loaded, Chain}, DL);
}
- // Custom lowering for extending v4i8 vector loads.
- EVT VT = Op->getValueType(0);
- assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
-
- if (LoadNode->getMemoryVT() != MVT::v4i8)
- return SDValue();
-
- // Avoid generating unaligned loads.
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
- return SDValue();
-
- unsigned ExtType;
- if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
- ExtType = ISD::SIGN_EXTEND;
- else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
- LoadNode->getExtensionType() == ISD::EXTLOAD)
- ExtType = ISD::ZERO_EXTEND;
- else
- return SDValue();
-
- SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
- LoadNode->getBasePtr(), MachinePointerInfo());
- SDValue Chain = Load.getValue(1);
- SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
- SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
- SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
- Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
- DAG.getConstant(0, DL, MVT::i64));
- if (VT == MVT::v4i32)
- Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
- return DAG.getMergeValues({Ext, Chain}, DL);
+ return SDValue();
}
SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
@@ -12676,6 +12665,55 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return IsLegal;
}
+ISD::LoadExtType
+AArch64TargetLowering::getPreferredExtendForPromotedLoad(LoadSDNode *N,
+ EVT LoadVT) const {
+ // Only prefer ZEXTLOAD for small integer vector types that will be
+ // optimized by performSmallVectorLoadExtCombine. We need to match the
+ // same conditions that function checks to avoid applying ZEXTLOAD when
+ // the load won't actually be optimized.
+
+ EVT MemVT = N->getMemoryVT();
+
+ // performSmallVectorLoadExtCombine only handles specific types
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 &&
+ MemVT != MVT::v2i16 && MemVT != MVT::v4i16) {
+ return ISD::EXTLOAD;
+ }
+
+ // performSmallVectorLoadExtCombine requires NEON
+ if (!Subtarget->isNeonAvailable()) {
+ return ISD::EXTLOAD;
+ }
+
+ // performSmallVectorLoadExtCombine bails out on volatile loads
+ if (N->isVolatile()) {
+ return ISD::EXTLOAD;
+ }
+
+ // Check alignment - performSmallVectorLoadExtCombine requires proper alignment
+ // when strict alignment is required
+ Align Alignment = N->getAlign();
+ Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
+ if (Subtarget->requiresStrictAlign() && Alignment < RequiredAlignment) {
+ // The load won't be optimized by performSmallVectorLoadExtCombine,
+ // so don't use ZEXTLOAD
+ return ISD::EXTLOAD;
+ }
+
+ // For these small integer vector types with proper alignment,
+ // prefer zero-extending loads to avoid the need for AND masks later.
+ // This is especially beneficial for the patterns created by
+ // performSmallVectorLoadExtCombine which converts these to scalar loads
+ // followed by vector operations.
+ if (isLoadExtLegal(ISD::ZEXTLOAD, LoadVT, MemVT)) {
+ return ISD::ZEXTLOAD;
+ }
+
+ // Default to EXTLOAD for other cases
+ return ISD::EXTLOAD;
+}
+
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
@@ -23300,6 +23338,137 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}
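+// Helper to optimize extending loads of small vectors (v2i8, v4i8, v2i16 and
+// v4i16 in memory). The whole vector is loaded with a single scalar
+// floating-point load, bitcast to an integer vector, and then widened with
+// vector zero/sign extends. These patterns would otherwise be scalarized
+// into inefficient sequences.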
+static SDValue performSmallVectorLoadExtCombine(LoadSDNode *LD,
+ SelectionDAG &DAG) {
+ // Don't optimize if NEON is not available.
+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isNeonAvailable())
+ return SDValue();
+
+ // Don't optimize volatile loads
+ if (LD->isVolatile())
+ return SDValue();
+
+ EVT MemVT = LD->getMemoryVT();
+ EVT ResVT = LD->getValueType(0);
+
+ // Only handle our specific small vector patterns.
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16 &&
+ MemVT != MVT::v4i16)
+ return SDValue();
+
+ unsigned NumElts = ResVT.getVectorNumElements();
+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
+
+ // Check alignment
+ Align Alignment = LD->getAlign();
+ Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
+ if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
+ return SDValue();
+
+ unsigned ExtOpcode;
+ switch (LD->getExtensionType()) {
+ case ISD::EXTLOAD:
+ case ISD::ZEXTLOAD:
+ ExtOpcode = ISD::ZERO_EXTEND;
+ break;
+ case ISD::SEXTLOAD:
+ ExtOpcode = ISD::SIGN_EXTEND;
+ break;
+ case ISD::NON_EXTLOAD:
+ return SDValue();
+ }
+
+ SDLoc DL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
+
+ SDValue Load;
+ SDValue Vec;
+
+ if (MemVT == MVT::v2i8) {
+ Load = DAG.getLoad(MVT::f16, DL, Chain, BasePtr, PtrInfo, Alignment);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Load);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4f16, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
+ if (DstEltBits >= 16) {
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v8i16, Vec);
+ if (DstEltBits >= 32) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec);
+ if (DstEltBits >= 64) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec);
+ }
+ }
+ }
+ } else if (MemVT == MVT::v4i8) {
+ Load = DAG.getLoad(MVT::f32, DL, Chain, BasePtr, PtrInfo, Alignment);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Load);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
+ if (DstEltBits >= 16) {
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v8i16, Vec);
+ if (DstEltBits >= 32) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec);
+ if (DstEltBits >= 64) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec);
+ }
+ }
+ }
+ } else if (MemVT == MVT::v2i16) {
+ Load = DAG.getLoad(MVT::f32, DL, Chain, BasePtr, PtrInfo, Alignment);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Load);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v4i16, Vec);
+ if (DstEltBits >= 32) {
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec);
+ if (DstEltBits >= 64) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec);
+ }
+ }
+ } else if (MemVT == MVT::v4i16) {
+ Load = DAG.getLoad(MVT::f64, DL, Chain, BasePtr, PtrInfo, Alignment);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, Load);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1f64, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ISD::BITCAST, DL, MVT::v4i16, Vec);
+ if (DstEltBits >= 32) {
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v4i32, Vec);
+ if (DstEltBits >= 64) {
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ Vec = DAG.getNode(ExtOpcode, DL, MVT::v2i64, Vec);
+ }
+ }
+ }
+
+ if (Vec.getValueType().getVectorNumElements() != NumElts) {
+ EVT FinalVT = EVT::getVectorVT(
+ *DAG.getContext(), Vec.getValueType().getVectorElementType(), NumElts);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ }
+
+
+
+ return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -24426,6 +24595,10 @@ static SDValue performLOADCombine(SDNode *N,
}
}
+ // Try to optimize small vector load + extension patterns
+ if (SDValue Result = performSmallVectorLoadExtCombine(LD, DAG))
+ return Result;
+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
return SDValue(N, 0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2cb8ed29f252..90871f86c4b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -145,6 +145,10 @@ public:
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
+ /// Return the preferred extension type for promoted loads.
+ ISD::LoadExtType getPreferredExtendForPromotedLoad(LoadSDNode *N,
+ EVT LoadVT) const override;
+
/// Return true if the given shuffle mask can be codegen'd directly, or if it
/// should be stack expanded.
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5ad9ad..bc0edc9b5eca 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) {
define <2 x i16> @test1(ptr %v2i16_ptr) {
; CHECK-LE-LABEL: test1:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #2
-; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #2
-; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i16 = load <2 x i16>, ptr %v2i16_ptr
@@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) {
define <2 x i8> @test3(ptr %v2i8_ptr) {
; CHECK-LE-LABEL: test3:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #1
-; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #1
-; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i8 = load <2 x i8>, ptr %v2i8_ptr
@@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) {
define <2 x i32> @fsext_v2i32(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i32:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i32:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) {
define <2 x i16> @fsext_v2i16(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i16:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i16:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -497,3 +495,219 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric
%v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1
ret <4 x i8> %v4i8
}
+
+define <2 x i16> @zext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @zext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @zext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i16> @sext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @sext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @sext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 0cd885e59981..2cd54d411354 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,21 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEON-NEXT: ldr h0, [x0]
+; CHECK-NEON-NEXT: ldr s0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: mov v0.d[1], x8
-; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
-; CHECK-SVE-NEXT: ldr h0, [x0]
+; CHECK-SVE-NEXT: ldr s0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: mov v0.d[1], x8
-; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index 96168cb80196..7502db4c5aa9 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -56,13 +56,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -228,13 +225,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index a7875dbebd0e..d8d003c85eed 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -176,12 +176,12 @@ entry:
define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: and_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -212,12 +212,12 @@ entry:
define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: or_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -248,12 +248,12 @@ entry:
define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: xor_v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
@@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -698,12 +695,10 @@ entry:
define void @and_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: and_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
@@ -734,12 +729,10 @@ entry:
define void @or_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: or_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
@@ -770,12 +763,10 @@ entry:
define void @xor_v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: xor_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 8ec5434085d6..bc95111ec427 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -103,12 +103,12 @@
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sqshlu_zero_shift_amount
define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: sqshl8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: sqshl v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: sqshl8b:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: sqshl v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 709a17e32f58..309f31d77ee6 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -509,7 +509,7 @@ define <4 x i8> @small_vector(<4 x i8> %0) {
; CHECK-NEXT: .seh_add_fp 176
; CHECK-NEXT: .seh_endprologue
; CHECK-NEXT: str w0, [sp, #12]
-; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: blr x9
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index 741dcf3ad4c2..8ba462daf14b 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -13,20 +13,6 @@ define <4 x i16> @z_i32_v4i16(i32 %x) {
; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: z_i32_v4i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: mov b1, v0.b[1]
-; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-GI-NEXT: mov b3, v0.b[2]
-; CHECK-GI-NEXT: mov b0, v0.b[3]
-; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
-; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
-; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
%b = bitcast i32 %x to <4 x i8>
%e = zext <4 x i8> %b to <4 x i16>
ret <4 x i16> %e
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 20f19fddf790..002e6cd509be 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: add x8, sp, #12
; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: str s0, [sp, #12]
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8]
-; CHECK-SD-NEXT: orr x8, x8, #0x2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 04124609eec7..b1b869ec9e1f 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -6,11 +6,10 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x0, #1]
+; CHECK-SD-NEXT: ldr h1, [x0]
; CHECK-SD-NEXT: movi v0.2s, #24
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: mov v1.s[1], w9
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
@@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -145,11 +143,9 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x0, #2]
+; CHECK-SD-NEXT: ldr s1, [x0]
; CHECK-SD-NEXT: movi v0.2s, #16
-; CHECK-SD-NEXT: fmov s1, w8
-; CHECK-SD-NEXT: mov v1.s[1], w9
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index d547b6bec5b8..9c59f1b233b5 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -6,10 +6,9 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x0, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
@@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -143,10 +141,8 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x0, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: mov v0.s[1], w9
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index fc9bf2c0aca6..c9181b4c312d 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -6,10 +6,10 @@
define void @v2i8(ptr %p1) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
+; CHECK-SD-NEXT: ldr h0, [x0]
; CHECK-SD-NEXT: movi v1.2s, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8
; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b
@@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -219,10 +218,9 @@ entry:
define void @v2i16(ptr %p1) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #2
+; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: movi v1.2s, #1
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16
; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index cabb0e7278e4..d18cff51c610 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) {
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrb w9, [x0, #3]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: ldr h0, [x0, #2]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.2s, v0.2s, #3
-; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%l1 = load <2 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 2
@@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) {
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x0, #4]
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ldr s0, [x0, #4]
+; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #3
-; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #3
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%l1b = load volatile float, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c4bb6e37d6ea..b138fa408542 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) {
define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) {
; CHECK-SD-LABEL: load_v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
@@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) {
define <2 x i16> @load_v2i16(ptr %ptr) {
; CHECK-SD-LABEL: load_v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 9c69a6f03b85..475bd22c6ebc 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -68,13 +68,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -240,13 +237,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 1c4a504d0ab7..9e321bbecb80 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
; CHECK-SD-NEXT: sqadd v0.4h, v0.4h, v1.4h
@@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
@@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
index 3e708b0678fb..297b25ed075e 100644
--- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll
@@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) {
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: .LBB3_1: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #1
-; CHECK-NEXT: ldrsb w10, [x9]
-; CHECK-NEXT: ldrsb w9, [x9, #1]
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ldr h0, [x0, x8, lsl #1]
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: scvtf v0.2d, v0.2d
; CHECK-NEXT: str q0, [x1, x8, lsl #4]
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 3af858713525..a30e9045c6a2 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -115,8 +115,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ldr s0, [x0]
; CHECK-SD-NEXT: ldr s1, [x1]
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
; CHECK-SD-NEXT: shl v1.4h, v1.4h, #8
; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8
; CHECK-SD-NEXT: sqsub v0.4h, v0.4h, v1.4h
@@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
@@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h
+; CHECK-SD-NEXT: zip1 v1.4h, v1.4h, v1.4h
; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 3a9f12b83870..1dc55fccc3da 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: strb w2, [x3, #2]
; CHECK-SD-NEXT: mov v0.h[1], w1
; CHECK-SD-NEXT: mov v0.h[2], w2
; CHECK-SD-NEXT: xtn v0.8b, v0.8h
-; CHECK-SD-NEXT: str s0, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
-; CHECK-SD-NEXT: strb w2, [x3, #2]
-; CHECK-SD-NEXT: strh w8, [x3]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: str h0, [x3]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 5e278d59b659..dd920b98e18e 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -56,13 +56,11 @@ entry:
define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: add x9, x1, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
-; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: stur b1, [x0, #1]
@@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; CHECK-SD-NEXT: mov h0, v0.h[2]
-; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: stur b0, [x0, #2]
-; CHECK-SD-NEXT: strh w8, [x0]
+; CHECK-SD-NEXT: str h1, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -228,13 +225,9 @@ entry:
define void @v2i16(ptr %p1, ptr %p2) {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1]
-; CHECK-SD-NEXT: add x8, x0, #2
-; CHECK-SD-NEXT: add x9, x1, #2
-; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8]
-; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9]
-; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x0]
; CHECK-SD-NEXT: str h1, [x0, #2]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 6fd5b820a224..b457e0307fbe 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu"
define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
@@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index ed03f9b32243..4fb3bf7392d4 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu"
define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
@@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_scatter_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: sshll v1.2d, v1.2s, #0
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 3cfb24aaccb1..cd02d18e6164 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x1]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff
-; CHECK-SD-NEXT: ldrb w10, [x0, #1]
-; CHECK-SD-NEXT: ldrb w11, [x1, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
@@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x1]
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-SD-NEXT: ldrh w10, [x0, #2]
-; CHECK-SD-NEXT: ldrh w11, [x1, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
-; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x2]
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index a71cf95a728d..ef70137e6dee 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: ldrb w9, [x1]
-; CHECK-SD-NEXT: ldrb w10, [x0, #1]
-; CHECK-SD-NEXT: ldrb w11, [x1, #1]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ldr h1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
@@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-LABEL: v2i16:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: ldrh w9, [x1]
-; CHECK-SD-NEXT: ldrh w10, [x0, #2]
-; CHECK-SD-NEXT: ldrh w11, [x1, #2]
-; CHECK-SD-NEXT: fmov s0, w8
-; CHECK-SD-NEXT: fmov s1, w9
-; CHECK-SD-NEXT: mov v0.s[1], w10
-; CHECK-SD-NEXT: mov v1.s[1], w11
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [x2]
diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
index f6553b6acec9..6d4061fb02cf 100644
--- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll
@@ -1,9 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
-; CHECK-LABEL: convert_v3f32
-; CHECK: strb
-; CHECK: strh
define void @convert_v3f32() {
+; CHECK-LABEL: convert_v3f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: str wzr, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: strb wzr, [x8]
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: str h0, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
entry:
br label %bb
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 80029fb71757..ee74984125f7 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
; CHECK-SD-NEXT: shl.16b v0, v0, #7
; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE
; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF]
-; CHECK-SD-NEXT: add x8, sp, #14
; CHECK-SD-NEXT: cmlt.16b v0, v0, #0
; CHECK-SD-NEXT: and.16b v0, v0, v1
; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8
; CHECK-SD-NEXT: zip1.16b v0, v0, v1
; CHECK-SD-NEXT: addv.8h h0, v0
-; CHECK-SD-NEXT: str h0, [sp, #14]
-; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8]
-; CHECK-SD-NEXT: orr x8, x8, #0x1
-; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-SD-NEXT: ushll.8h v0, v0, #0
+; CHECK-SD-NEXT: ushll.4s v0, v0, #0
; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 7d3f5bc270d6..a5a26c185fdb 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -338,7 +338,7 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: ushll v0.4s, v0.4h, #0
; BE-NEXT: shl v0.4s, v0.4s, #24
@@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ldr s0, [x0]
; BE-NEXT: ldrh w8, [x0, #4]
; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: strb w8, [x1, #2]
; BE-NEXT: mov v0.h[2], w8
; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; BE-NEXT: rev32 v0.16b, v0.16b
-; BE-NEXT: str s0, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: str h0, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -459,7 +459,7 @@ define void @load_ext_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: bic v0.4h, #255, lsl #8
; BE-NEXT: rev32 v1.8h, v0.8h
@@ -562,7 +562,7 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
; BE-NEXT: add x8, x0, #2
; BE-NEXT: ldr s0, [sp, #12]
; BE-NEXT: rev32 v0.8b, v0.8b
-; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; BE-NEXT: ld1 { v0.b }[4], [x8]
; BE-NEXT: adrp x8, .LCPI15_0
; BE-NEXT: add x8, x8, :lo12:.LCPI15_0
@@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #3]
-; BE-NEXT: sturh w8, [x1, #1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: stur h1, [x1, #1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #5]
-; BE-NEXT: sturh w8, [x1, #3]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: stur h1, [x1, #3]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -764,10 +764,9 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: shrn.4h v0, v0, #16
; CHECK-NEXT: uzp1.8b v1, v0, v0
; CHECK-NEXT: mov h0, v0[2]
-; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w8, [sp, #12]
+; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: stur b0, [x1, #2]
-; CHECK-NEXT: strh w8, [x1]
+; CHECK-NEXT: str h1, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
@@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w8, [sp, #12]
; BE-NEXT: stur b0, [x1, #2]
-; BE-NEXT: strh w8, [x1]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w8, [sp, #8]
; BE-NEXT: stur b0, [x0, #2]
-; BE-NEXT: strh w8, [x0]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
@@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
; BE-NEXT: mov h0, v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
-; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w8, [sp, #8]
; BE-NEXT: stur b0, [x0, #2]
-; BE-NEXT: strh w8, [x0]
+; BE-NEXT: rev32 v1.4h, v1.4h
+; BE-NEXT: ushll v1.4s, v1.4h, #0
+; BE-NEXT: str h1, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 74a717f1635a..7cba0d608cd4 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1012,18 +1012,16 @@ define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
;
; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: adrp x8, .LCPI11_0
-; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI11_0
-; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT: mov x8, xzr
; CHECK-BE-NEXT: .LBB11_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: ldr s1, [x0, x8]
+; CHECK-BE-NEXT: ldr s0, [x0, x8]
; CHECK-BE-NEXT: add x8, x8, #16
; CHECK-BE-NEXT: cmp x8, #128
-; CHECK-BE-NEXT: rev32 v1.16b, v1.16b
-; CHECK-BE-NEXT: tbl v1.16b, { v1.16b }, v0.16b
-; CHECK-BE-NEXT: st1 { v1.16b }, [x1]
+; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
; CHECK-BE-NEXT: add x1, x1, #64
; CHECK-BE-NEXT: b.ne .LBB11_1
; CHECK-BE-NEXT: // %bb.2: // %exit