summaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
authorVitaly Buka <vitalybuka@google.com>2024-09-23 15:55:29 -0700
committerVitaly Buka <vitalybuka@google.com>2024-09-23 15:55:29 -0700
commit80323f174971174928c87fb0e958a6fcfe094d59 (patch)
treebb0862b94fc42ba636ea993820a3368b851fd334 /llvm/test/CodeGen
parent1c4f36eefcbee84fe801c6817ff4cdc7feeafd13 (diff)
parent8dbb739ffb0880e4f739992d07dc6ba6edca9509 (diff)
Created using spr 1.3.4 [skip ci]
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir24
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir11
-rw-r--r--llvm/test/CodeGen/AArch64/bswap.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/concat-vector.ll16
-rw-r--r--llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll22
-rw-r--r--llvm/test/CodeGen/AArch64/fptoi.ll52
-rw-r--r--llvm/test/CodeGen/AArch64/itofp.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/shift.ll90
-rw-r--r--llvm/test/CodeGen/AArch64/shufflevector.ll45
-rw-r--r--llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll156
-rw-r--r--llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll117
-rw-r--r--llvm/test/CodeGen/AArch64/xtn.ll17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll73
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll74
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll82
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll144
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-true16.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir55
-rw-r--r--llvm/test/CodeGen/AMDGPU/skip-if-dead.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll4
-rw-r--r--llvm/test/CodeGen/ARM/vbsl.ll176
-rw-r--r--llvm/test/CodeGen/Generic/allow-check.ll2
-rw-r--r--llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir8
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/ashr.ll343
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/lshr.ll335
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll135
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/shl.ll302
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/srem.ll131
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll60
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll133
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll133
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll133
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll133
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/udiv.ll131
-rw-r--r--llvm/test/CodeGen/Mips/llvm-ir/urem.ll143
-rw-r--r--llvm/test/CodeGen/NVPTX/fence-sm-90.ll30
-rw-r--r--llvm/test/CodeGen/NVPTX/fence.ll76
-rw-r--r--llvm/test/CodeGen/NVPTX/load-store-sm-70.ll2906
-rw-r--r--llvm/test/CodeGen/NVPTX/load-store-sm-90.ll1423
-rw-r--r--llvm/test/CodeGen/NVPTX/load-store.ll507
-rw-r--r--llvm/test/CodeGen/NVPTX/rotate.ll433
-rw-r--r--llvm/test/CodeGen/NVPTX/rotate_64.ll33
-rw-r--r--llvm/test/CodeGen/PowerPC/ctrloop-sh.ll240
-rw-r--r--llvm/test/CodeGen/PowerPC/pr59074.ll83
-rw-r--r--llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll1418
-rw-r--r--llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll702
-rw-r--r--llvm/test/CodeGen/RISCV/shifts.ll366
-rw-r--r--llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll5989
-rw-r--r--llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll3571
-rw-r--r--llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll415
-rw-r--r--llvm/test/CodeGen/X86/canonicalize-vars.ll636
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll488
-rw-r--r--llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll505
-rw-r--r--llvm/test/CodeGen/X86/pr38539.ll160
-rw-r--r--llvm/test/CodeGen/X86/scheduler-backtracking.ll140
-rw-r--r--llvm/test/CodeGen/X86/section-stats.ll2
-rw-r--r--llvm/test/CodeGen/X86/shift-i128.ll657
-rw-r--r--llvm/test/CodeGen/X86/shift-i256.ll418
-rw-r--r--llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll23081
-rw-r--r--llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll7677
-rw-r--r--llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll3560
-rw-r--r--llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll1645
72 files changed, 46808 insertions, 13820 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
index 3e768c4d7a26..03c28efe7e09 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir
@@ -159,25 +159,13 @@ body: |
; CHECK-LABEL: name: test_freeze_v3s8
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16)
- ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16)
- ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s16>) = G_FREEZE [[UV4]]
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[FREEZE]](<4 x s16>)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]]
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>)
; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
- ; CHECK-NEXT: %ext0:_(s32) = G_AND [[ANYEXT1]], [[C]]
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; CHECK-NEXT: %ext1:_(s32) = G_AND [[ANYEXT2]], [[C]]
- ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
- ; CHECK-NEXT: %ext2:_(s32) = G_AND [[ANYEXT3]], [[C]]
+ ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8)
+ ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8)
+ ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8)
; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32)
; CHECK-NEXT: $q0 = COPY %res(<4 x s32>)
%x:_(<3 x s8>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
index 9a8697c1d9b8..11c6c7fb40fa 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir
@@ -248,13 +248,10 @@ body: |
; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16)
; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16)
; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16)
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>)
- ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16)
- ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16)
- ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV8]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef)
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>)
; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>)
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index e90014be21de..b14f1a43b7dc 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -177,9 +177,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){
;
; CHECK-GI-LABEL: bswap_v2i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: rev16 v0.8b, v0.8b
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 18570b2d793f..eee917e8acb0 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -183,15 +183,12 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v8s16_v2s16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: ldr h2, [x0, #2]
-; CHECK-GI-NEXT: dup v0.4s, w8
-; CHECK-GI-NEXT: mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT: xtn v2.4h, v0.4s
-; CHECK-GI-NEXT: xtn v1.4h, v1.4s
-; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: ldr h0, [x0]
+; CHECK-GI-NEXT: ldr h1, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: mov v0.s[0], w8
-; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: mov v0.s[3], w8
@@ -209,10 +206,7 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
;
; CHECK-GI-LABEL: concat_v16s8_v4s8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v0.8h, w8
-; CHECK-GI-NEXT: xtn v1.8b, v0.8h
; CHECK-GI-NEXT: ldr s0, [x0]
-; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: mov v0.s[1], w8
; CHECK-GI-NEXT: mov v0.s[2], w8
; CHECK-GI-NEXT: mov v0.s[3], w8
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index aa20304e52a9..a9618fdc2dec 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -3,24 +3,10 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
-; CHECK-SD-LABEL: interleave2_v4f16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: interleave2_v4f16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: dup v2.4s, w8
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: xtn v0.4h, v2.4s
-; CHECK-GI-NEXT: mov v1.s[0], w8
-; CHECK-GI-NEXT: mov v2.s[0], w9
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: interleave2_v4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
%retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
ret <4 x half> %retval
}
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 20b5567e973d..f72a49f6ab7c 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -3172,42 +3172,22 @@ entry:
}
define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
-; CHECK-SD-LABEL: fptos_v3f32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptos_v3f32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptos_v3f32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptosi <3 x float> %a to <3 x i16>
ret <3 x i16> %c
}
define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
-; CHECK-SD-LABEL: fptou_v3f32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fptou_v3f32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fptou_v3f32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%c = fptoui <3 x float> %a to <3 x i16>
ret <3 x i16> %c
@@ -6077,11 +6057,7 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16:
@@ -6110,11 +6086,7 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8
-; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9
-; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16:
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 4ac04798e154..f70ec0f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -7450,9 +7450,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
;
; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -7493,9 +7491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
;
; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
; CHECK-GI-FP16: // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
@@ -8059,8 +8055,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT: mov w8, v0.s[1]
-; CHECK-GI-FP16-NEXT: mov v0.h[1], w8
+; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h
; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 7014a4a9acbe..54f7887aee8d 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -531,26 +531,8 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: shl_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v1.h[1]
-; CHECK-GI-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov h3, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: mov v1.b[1], w9
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
@@ -592,12 +574,8 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: shl_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
@@ -741,26 +719,8 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: ashr_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[3]
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v1.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
@@ -802,12 +762,8 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: ashr_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v1.s[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
@@ -946,26 +902,8 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){
;
; CHECK-GI-LABEL: lshr_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v1.h[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NEXT: mov h4, v1.h[2]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[3]
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: mov h4, v0.h[3]
-; CHECK-GI-NEXT: mov v1.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov h3, v0.h[2]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s3
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v1.b[3], w8
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: neg v1.8b, v1.8b
; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mov b1, v0.b[1]
@@ -1006,12 +944,8 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){
;
; CHECK-GI-LABEL: lshr_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v1.s[1]
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w9, v0.s[1]
-; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[1], w9
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: neg v1.4h, v1.4h
; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: mov h1, v0.h[1]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index 954458e44597..5f4ff1e64673 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -209,27 +209,9 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NEXT: mov h3, v1.h[1]
-; CHECK-GI-NEXT: mov h4, v0.h[2]
-; CHECK-GI-NEXT: mov h5, v0.h[3]
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov h2, v1.h[2]
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov h3, v1.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: mov v1.b[1], w9
-; CHECK-GI-NEXT: fmov w8, s4
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: mov v1.b[2], w9
-; CHECK-GI-NEXT: fmov w8, s5
-; CHECK-GI-NEXT: fmov w9, s3
-; CHECK-GI-NEXT: mov v0.b[3], w8
-; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: adrp x8, .LCPI15_0
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -284,13 +266,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v1.h[1], w9
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: adrp x8, .LCPI17_0
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
@@ -403,16 +381,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h1, v0.h[1]
-; CHECK-GI-NEXT: mov h2, v0.h[2]
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov h1, v0.h[3]
-; CHECK-GI-NEXT: mov v0.b[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.b[2], w8
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.b[3], w8
+; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: dup v0.8b, v0.b[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
@@ -448,9 +417,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index e21015ad3db3..b02788ab1b34 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -186,10 +186,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
+; ALL-NEXT: and x9, x10, #0x18
; ALL-NEXT: str q1, [sp]
; ALL-NEXT: add x8, x8, x9
+; ALL-NEXT: lsl x9, x10, #3
; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: ldp x11, x10, [x8, #16]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x9
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: lsr x10, x10, x9
+; ALL-NEXT: lsr x12, x12, x9
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x8, x9
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: lshr_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q1, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: ubfiz x8, x10, #3, #2
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: str q1, [sp]
+; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: add x8, x9, x8
; ALL-NEXT: ldp x10, x9, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -197,12 +241,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
@@ -213,11 +258,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #48]
; ALL-NEXT: mov x8, sp
-; ALL-NEXT: and x9, x10, #0x1f
+; ALL-NEXT: and x9, x10, #0x18
; ALL-NEXT: add x8, x8, #32
; ALL-NEXT: stp q0, q0, [sp]
; ALL-NEXT: str q1, [sp, #32]
; ALL-NEXT: sub x8, x8, x9
+; ALL-NEXT: lsl x9, x10, #3
+; ALL-NEXT: ldp x10, x11, [x8]
+; ALL-NEXT: ldp x12, x8, [x8, #16]
+; ALL-NEXT: mvn w13, w9
+; ALL-NEXT: and x9, x9, #0x38
+; ALL-NEXT: lsr x14, x10, #1
+; ALL-NEXT: lsr x15, x11, #1
+; ALL-NEXT: lsl x11, x11, x9
+; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: lsl x10, x10, x9
+; ALL-NEXT: lsl x12, x12, x9
+; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsl x8, x8, x9
+; ALL-NEXT: lsr x9, x16, x13
+; ALL-NEXT: lsr x13, x15, x13
+; ALL-NEXT: orr x11, x11, x14
+; ALL-NEXT: orr x8, x8, x9
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x10, x11, [x2]
+; ALL-NEXT: stp x9, x8, [x2, #16]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: shl_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: movi v0.2d, #0000000000000000
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q1, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #48]
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: ubfiz x9, x10, #3, #2
+; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: stp q0, q1, [sp, #16]
+; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: sub x8, x8, x9
; ALL-NEXT: ldp x9, x10, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -225,12 +315,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = shl i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
@@ -238,14 +329,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
-; ALL-NEXT: and x10, x10, #0x1f
+; ALL-NEXT: and x11, x10, #0x18
; ALL-NEXT: stp x9, x8, [sp, #16]
; ALL-NEXT: asr x8, x8, #63
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q0, [sp]
+; ALL-NEXT: add x9, x9, x11
+; ALL-NEXT: stp x8, x8, [sp, #48]
+; ALL-NEXT: stp x8, x8, [sp, #32]
+; ALL-NEXT: lsl x8, x10, #3
+; ALL-NEXT: ldp x11, x10, [x9, #16]
+; ALL-NEXT: ldp x9, x12, [x9]
+; ALL-NEXT: mvn w13, w8
+; ALL-NEXT: and x8, x8, #0x38
+; ALL-NEXT: lsl x14, x10, #1
+; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsr x11, x11, x8
+; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: asr x10, x10, x8
+; ALL-NEXT: lsr x12, x12, x8
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x8, x9, x8
+; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x13, x15, x13
+; ALL-NEXT: orr x11, x14, x11
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: stp x11, x10, [x2, #16]
+; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; ALL-LABEL: ashr_32bytes_dwordOff:
+; ALL: // %bb.0:
+; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: ldp x9, x8, [x0, #16]
+; ALL-NEXT: ldr x10, [x1]
+; ALL-NEXT: ldr q0, [x0]
+; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: asr x8, x8, #63
+; ALL-NEXT: ubfiz x9, x10, #3, #2
+; ALL-NEXT: mov x10, sp
+; ALL-NEXT: str q0, [sp]
; ALL-NEXT: stp x8, x8, [sp, #48]
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x9, x10
+; ALL-NEXT: add x8, x10, x9
; ALL-NEXT: ldp x10, x9, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
@@ -253,8 +389,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index a4da6db57eca..531e0fa740da 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: ubfx x8, x10, #3, #5
+; ALL-NEXT: lsr x8, x10, #3
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: stp q0, q0, [sp, #32]
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: add x8, x9, x8
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x11, x9, [x8, #16]
-; ALL-NEXT: ldp x8, x12, [x8]
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: lsr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x9, [x2, #16]
-; ALL-NEXT: orr x8, x10, x8
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #48]
-; ALL-NEXT: mov x8, sp
-; ALL-NEXT: ubfx x9, x10, #3, #5
-; ALL-NEXT: add x8, x8, #32
+; ALL-NEXT: lsr x8, x10, #3
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: add x9, x9, #32
; ALL-NEXT: stp q0, q1, [sp, #16]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: sub x8, x8, x9
-; ALL-NEXT: mvn w13, w10
-; ALL-NEXT: ldp x9, x11, [x8]
-; ALL-NEXT: ldp x12, x8, [x8, #16]
-; ALL-NEXT: lsr x14, x9, #1
-; ALL-NEXT: lsr x15, x11, #1
-; ALL-NEXT: lsl x11, x11, x10
-; ALL-NEXT: lsr x16, x12, #1
+; ALL-NEXT: eor x12, x12, #0x3f
+; ALL-NEXT: sub x8, x9, x8
+; ALL-NEXT: ldp x11, x13, [x8, #16]
+; ALL-NEXT: ldr x9, [x8]
+; ALL-NEXT: ldr x8, [x8, #8]
+; ALL-NEXT: lsr x15, x9, #1
; ALL-NEXT: lsl x9, x9, x10
-; ALL-NEXT: lsl x12, x12, x10
-; ALL-NEXT: lsr x14, x14, x13
+; ALL-NEXT: lsr x14, x11, #1
+; ALL-NEXT: lsl x11, x11, x10
+; ALL-NEXT: lsl x13, x13, x10
+; ALL-NEXT: lsr x14, x14, x12
+; ALL-NEXT: lsr x12, x15, x12
+; ALL-NEXT: lsr x15, x8, #1
; ALL-NEXT: lsl x8, x8, x10
-; ALL-NEXT: lsr x10, x16, x13
-; ALL-NEXT: lsr x13, x15, x13
-; ALL-NEXT: orr x11, x11, x14
-; ALL-NEXT: stp x9, x11, [x2]
-; ALL-NEXT: orr x8, x8, x10
-; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x9, x8, [x2, #16]
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsr x10, x15, x10
+; ALL-NEXT: orr x8, x8, x12
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: orr x9, x13, x14
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
@@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
+; ALL-NEXT: lsr x9, x10, #3
; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: ubfx x9, x10, #3, #5
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: and x10, x10, #0x7
+; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x9, x9, #0x18
; ALL-NEXT: stp x8, x8, [sp, #48]
-; ALL-NEXT: add x9, x11, x9
-; ALL-NEXT: mvn w13, w10
+; ALL-NEXT: eor x12, x12, #0x3f
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: ldp x11, x8, [x9, #16]
-; ALL-NEXT: ldp x9, x12, [x9]
-; ALL-NEXT: lsl x14, x8, #1
+; ALL-NEXT: add x8, x11, x9
+; ALL-NEXT: ldp x13, x11, [x8]
+; ALL-NEXT: ldr x9, [x8, #24]
+; ALL-NEXT: ldr x8, [x8, #16]
+; ALL-NEXT: lsl x14, x9, #1
+; ALL-NEXT: asr x9, x9, x10
; ALL-NEXT: lsl x15, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x16, x12, #1
-; ALL-NEXT: asr x8, x8, x10
-; ALL-NEXT: lsr x12, x12, x10
-; ALL-NEXT: lsl x14, x14, x13
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsl x10, x16, x13
-; ALL-NEXT: lsl x13, x15, x13
-; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: stp x11, x8, [x2, #16]
-; ALL-NEXT: orr x8, x10, x9
+; ALL-NEXT: lsr x13, x13, x10
+; ALL-NEXT: lsl x14, x14, x12
+; ALL-NEXT: lsl x12, x15, x12
+; ALL-NEXT: lsl x15, x8, #1
+; ALL-NEXT: lsr x8, x8, x10
+; ALL-NEXT: mvn w10, w10
+; ALL-NEXT: lsl x10, x15, x10
+; ALL-NEXT: orr x8, x14, x8
+; ALL-NEXT: stp x8, x9, [x2, #16]
; ALL-NEXT: orr x9, x12, x13
-; ALL-NEXT: stp x8, x9, [x2]
+; ALL-NEXT: orr x8, x11, x10
+; ALL-NEXT: stp x9, x8, [x2]
; ALL-NEXT: add sp, sp, #64
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index ead790203f94..fb3f8ebd7d14 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -294,19 +294,10 @@ entry:
}
define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
-; CHECK-SD-LABEL: xtn_v3i32_v3i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: xtn v0.4h, v0.4s
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: xtn_v3i32_v3i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, v0.s[1]
-; CHECK-GI-NEXT: mov w9, v0.s[2]
-; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov v0.h[2], w9
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: xtn_v3i32_v3i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
entry:
%arg1 = trunc <3 x i32> %a to <3 x i16>
ret <3 x i16> %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index bb7bc0447aea..c5ded11c7d32 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -167,8 +167,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 49c232661c6d..b27d8fdc24ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -75,12 +75,12 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
; GFX10-NEXT: .LBB1_1: ; %loop.cond
; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
-; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, s6
+; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0
; GFX10-NEXT: s_or_b32 s4, s7, s8
; GFX10-NEXT: s_cbranch_vccz .LBB1_4
; GFX10-NEXT: .LBB1_2: ; %loop.start
@@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
new file mode 100644
index 000000000000..c8ba6722d9d8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s
+
+; OBJDUMP: Contents of section .rodata:
+; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................
+; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................
+; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000 @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
+; ASM: .amdhsa_user_sgpr_count 10
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; NumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
+
+; Test that preloaded SGPRs that are not explicitly referenced in the kernel
+; are included in the GRANULATED_WAVEFRONT_SGPR_COUNT field. This test has 6 implicit
+; user SGPRs enabled, 4 preloaded kernarg SGPRs, plus 6 extra SGPRs allocated
+; for flat scratch, etc. The total number of allocated SGPRs encoded in the
+; kernel descriptor should be 16. That's a 1 in the KD field since the granule
+; size is 8 and it's NumGranules - 1. The encoding for that looks like '40'.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void }
+
+; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ .......
+; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 10
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; NumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
+
+; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
+; implicit, and 6 extra.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void }
+
+; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+
+; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 3
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; NumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
+
+; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void }
+
+; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................
+; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................
+
+; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
+; ASM: .amdhsa_user_sgpr_count 2
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; NumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6
+
+; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
+; Encoded like '00'.
+
+define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void }
+
+attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b17dfc7c3754..ce608df44dc4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1323,9 +1323,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1451,10 +1451,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1587,9 +1586,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -3228,8 +3227,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -4991,9 +4990,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s0, s2
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5119,10 +5118,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s4, s6
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -5255,9 +5253,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16
; GFX1232_DPP-NEXT: s_wait_alu 0xfffe
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s6
; GFX1232_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2
@@ -6938,8 +6936,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s2, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 988bc8eec6e5..ce90fbed8131 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -936,8 +936,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -1047,8 +1047,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2
@@ -2684,8 +2684,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -2874,8 +2874,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
@@ -3383,8 +3383,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
@@ -4444,8 +4444,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -4555,8 +4555,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2
@@ -6218,8 +6218,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6408,8 +6408,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
@@ -6915,8 +6915,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -7026,9 +7026,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2
@@ -7627,8 +7626,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -7786,8 +7785,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
@@ -8294,8 +8293,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -8405,8 +8404,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2
@@ -9006,8 +9005,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -9165,8 +9164,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
@@ -9673,8 +9672,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -9784,8 +9783,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2
@@ -10385,8 +10384,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -10544,8 +10543,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
@@ -11051,8 +11050,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -11162,9 +11161,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2
@@ -12196,8 +12194,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12415,8 +12413,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
@@ -12923,8 +12921,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -13034,9 +13032,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2
@@ -14788,8 +14785,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -14899,8 +14896,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2
@@ -15909,8 +15906,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -16125,8 +16122,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
@@ -16633,8 +16630,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -16744,9 +16741,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2
@@ -17754,8 +17750,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
@@ -17970,8 +17966,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 2b18f472c8c4..c3a197ce9985 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1263,16 +1263,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1483,16 +1483,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2471,16 +2471,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2721,16 +2721,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4503,16 +4503,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -4753,16 +4753,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5929,19 +5929,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s33, s8
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6378,19 +6378,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -7595,8 +7595,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8020,16 +8020,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8277,16 +8277,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9107,8 +9107,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9444,16 +9444,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -9701,16 +9701,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10531,8 +10531,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11437,8 +11437,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13574,8 +13574,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e3144ae24ae8..69c6adf0300c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ddc103184cdf..b7890f30f776 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
@@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_add_u32 s48, s48, s9
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
@@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
;
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index f353edff1b47..fcd5d0dc497e 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1367,16 +1367,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
@@ -1617,16 +1617,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -2687,16 +2687,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
@@ -2937,16 +2937,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -4823,16 +4823,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
@@ -5073,16 +5073,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -6249,19 +6249,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-NEXT: s_mov_b32 s33, s8
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-NEXT: s_mov_b32 s44, 0
; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
@@ -6698,19 +6698,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -7915,8 +7915,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -8340,16 +8340,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
@@ -8597,16 +8597,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -9426,8 +9426,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -9763,16 +9763,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s14, -1
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
@@ -10020,16 +10020,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
-; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
@@ -10850,8 +10850,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -11756,8 +11756,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
@@ -13892,8 +13892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index b061d53de5d3..39a3b1c8adc9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -2,11 +2,118 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s
-declare i32 @llvm.amdgcn.readfirstlane(i32) #0
-declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0
-declare double @llvm.amdgcn.readfirstlane.f64(double) #0
+define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i1:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i1:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src)
+ store i1 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i1_inreg:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: s_and_b32 s4, s6, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i1_inreg:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: s_and_b32 s4, s6, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src)
+ store i1 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i1_select:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2
+; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-SDAG-NEXT: s_bitcmp1_b32 s4, 0
+; CHECK-SDAG-NEXT: s_cselect_b64 vcc, -1, 0
+; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i1_select:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2
+; CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v4
+; CHECK-GISEL-NEXT: s_and_b32 s4, 1, s4
+; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp ugt i32 %src, 42
+ %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %cmp)
+ %sel = select i1 %readfirstlane, i32 %src, i32 %src1
+ store i32 %sel, ptr addrspace(1) %out, align 4
+ ret void
+}
-define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
+define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; CHECK-SDAG-LABEL: test_readfirstlane_i1_load:
+; CHECK-SDAG: ; %bb.0:
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-SDAG-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2
+; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0)
+; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CHECK-GISEL-LABEL: test_readfirstlane_i1_load:
+; CHECK-GISEL: ; %bb.0:
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-GISEL-NEXT: flat_load_ubyte v2, v[2:3]
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2
+; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0)
+; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %load = load i1, ptr addrspace(1) %in
+ %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %load)
+ store i1 %readfirstlane, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -29,7 +136,7 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 {
ret void
}
-define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
+define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -56,7 +163,7 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 {
ret void
}
-define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
+define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) {
; CHECK-SDAG-LABEL: test_readfirstlane_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -83,7 +190,7 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b32 s0, 32
@@ -104,7 +211,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32
@@ -125,7 +232,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_mov_b32 s0, 0
@@ -148,7 +255,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -173,7 +280,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -201,7 +308,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -230,7 +337,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -262,7 +369,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -294,7 +401,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -328,7 +435,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -362,7 +469,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
ret void
}
-define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_fi:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15
@@ -593,6 +700,3 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) {
call void asm sideeffect "; use $0", "s"(<8 x i16> %x)
ret void
}
-
-attributes #0 = { nounwind readnone convergent }
-attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 684ca3aac7c3..004a720b9ab4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -216,8 +216,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
; GFX10-32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0
; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB2_3
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index f60786c1bacb..6f841c88a6d8 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -4,8 +4,8 @@
define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_then:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
@@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i
define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) {
; GCN-LABEL: if_else_vgpr_opt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: ; %bb.1: ; %.bb0
; GCN-NEXT: v_mov_b32_e32 v3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
index 90b32e29e98f..3519befabd3b 100644
--- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll
@@ -4,10 +4,10 @@
define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 {
; GCN-LABEL: should_not_hoist_set_inactive:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1
; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0
; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2
; GCN-NEXT: s_mov_b32 s7, 0
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1
; GCN-NEXT: s_branch .LBB0_2
; GCN-NEXT: .LBB0_1: ; %bb4
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
index 1a7ec5db9efa..be759049bc3a 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir
@@ -11,7 +11,7 @@ body: |
; GFX1100-LABEL: name: 16bit_lo128_shrink
; GFX1100: liveins: $vgpr127
; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc, implicit $exec, implicit $exec
+ ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc_lo, implicit $exec, implicit $exec
$vcc_lo = V_CMP_EQ_U16_t16_e64 0, $vgpr127, implicit-def $vcc, implicit $exec
...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
new file mode 100644
index 000000000000..73c55265af20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-shrink-instructions -mcpu=gfx1100 -o - %s | FileCheck %s
+
+# Make sure there's no crash when shrinking a v_cmp on a wave32 target
+# when the def is dead. Previously the vcc implicit def wasn't
+# properly replaced with vcc_lo, so the expected implicit operand was
+# not found in the shrunk instruction.
+
+---
+name: shrink_v_cmp_vcc_lo_dead
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_dead
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def dead $vcc_lo, implicit $exec
+ ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31
+ dead renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31
+
+...
+
+---
+name: shrink_v_cmp_vcc_lo_live
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+ isCalleeSavedInfoValid: true
+machineFunctionInfo:
+ maxKernArgAlign: 1
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_live
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec
+ ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+ renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index eebd32cd67e6..8e0a83671a18 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1027,8 +1027,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
;
; GFX10-WAVE32-LABEL: test_kill_divergent_loop:
; GFX10-WAVE32: ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
index 25d8300eb458..a0bce3432a4b 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll
@@ -86,8 +86,8 @@ end:
define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
; SI-LABEL: else3:
; SI: ; %bb.0: ; %entry
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_mov_b32 s1, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_branch .LBB2_2
; SI-NEXT: .LBB2_1: ; %if.end
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
@@ -161,16 +161,16 @@ for.end:
define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
; SI-LABEL: loop:
; SI: ; %bb.0: ; %main_body
-; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: v_mov_b32_e32 v6, v0
; SI-NEXT: v_mov_b32_e32 v0, v1
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6
; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
@@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: v_mov_b32_e32 v40, v1
-; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: s_mov_b32 s15, 0x31c16000
; SI-NEXT: s_add_u32 s12, s12, s1
; SI-NEXT: s_addc_u32 s13, s13, 0
; SI-NEXT: s_mov_b32 s32, 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo
; SI-NEXT: s_xor_b32 s6, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 92117e0688f6..4576d829b0cb 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
-; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
@@ -515,8 +515,8 @@ bb13:
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
diff --git a/llvm/test/CodeGen/ARM/vbsl.ll b/llvm/test/CodeGen/ARM/vbsl.ll
index 735fa5182fe7..8564a48fbc3d 100644
--- a/llvm/test/CodeGen/ARM/vbsl.ll
+++ b/llvm/test/CodeGen/ARM/vbsl.ll
@@ -1,17 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
-
-; rdar://12471808
+; RUN: llc -mtriple=armv7-eabihf -mattr=+neon %s -o - | FileCheck %s
define <8 x i8> @v_bsli8(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: v_bsli8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vldr d16, [r2]
+; CHECK-NEXT: vorr d0, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d17, d16
+; CHECK-NEXT: bx lr
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = load <8 x i8>, ptr %B
%tmp3 = load <8 x i8>, ptr %C
@@ -27,10 +25,10 @@ define <4 x i16> @v_bsli16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vldr d16, [r2]
+; CHECK-NEXT: vorr d0, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d17, d16
+; CHECK-NEXT: bx lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i16>, ptr %C
@@ -46,10 +44,10 @@ define <2 x i32> @v_bsli32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vldr d16, [r2]
+; CHECK-NEXT: vorr d0, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d17, d16
+; CHECK-NEXT: bx lr
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i32>, ptr %C
@@ -65,10 +63,10 @@ define <1 x i64> @v_bsli64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d18, [r0]
; CHECK-NEXT: vldr d16, [r2]
+; CHECK-NEXT: vorr d0, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d17, d16
+; CHECK-NEXT: bx lr
%tmp1 = load <1 x i64>, ptr %A
%tmp2 = load <1 x i64>, ptr %B
%tmp3 = load <1 x i64>, ptr %C
@@ -83,12 +81,11 @@ define <16 x i8> @v_bslQi8(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: v_bslQi8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT: vorr q0, q10, q10
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q9, q8
+; CHECK-NEXT: bx lr
%tmp1 = load <16 x i8>, ptr %A
%tmp2 = load <16 x i8>, ptr %B
%tmp3 = load <16 x i8>, ptr %C
@@ -103,12 +100,11 @@ define <8 x i16> @v_bslQi16(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: v_bslQi16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT: vorr q0, q10, q10
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q9, q8
+; CHECK-NEXT: bx lr
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = load <8 x i16>, ptr %C
@@ -123,12 +119,11 @@ define <4 x i32> @v_bslQi32(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: v_bslQi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT: vorr q0, q10, q10
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q9, q8
+; CHECK-NEXT: bx lr
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
@@ -143,12 +138,11 @@ define <2 x i64> @v_bslQi64(ptr %A, ptr %B, ptr %C) nounwind {
; CHECK-LABEL: v_bslQi64:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
+; CHECK-NEXT: vorr q0, q10, q10
; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vbit q8, q9, q10
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q9, q8
+; CHECK-NEXT: bx lr
%tmp1 = load <2 x i64>, ptr %A
%tmp2 = load <2 x i64>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
@@ -162,12 +156,8 @@ define <2 x i64> @v_bslQi64(ptr %A, ptr %B, ptr %C) nounwind {
define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f1:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
ret <8 x i8> %vbsl.i
}
@@ -175,12 +165,8 @@ define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone opt
define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f2:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
ret <4 x i16> %vbsl3.i
}
@@ -188,12 +174,8 @@ define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone
define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f3:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
ret <2 x i32> %vbsl3.i
}
@@ -201,12 +183,8 @@ define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone
define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f4:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
ret <2 x float> %vbsl4.i
}
@@ -214,16 +192,8 @@ define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g1:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
ret <16 x i8> %vbsl.i
}
@@ -231,16 +201,8 @@ define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone
define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g2:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
ret <8 x i16> %vbsl3.i
}
@@ -248,16 +210,8 @@ define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone
define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g3:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
ret <4 x i32> %vbsl3.i
}
@@ -265,16 +219,8 @@ define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone
define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g4:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
ret <4 x float> %vbsl4.i
}
@@ -282,12 +228,8 @@ define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_s64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
ret <1 x i64> %vbsl3.i
}
@@ -295,12 +237,8 @@ define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwi
define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_u64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vldr d16, [sp]
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: vbit d16, d17, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl d0, d1, d2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
ret <1 x i64> %vbsl3.i
}
@@ -308,16 +246,8 @@ define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwi
define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_s64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
ret <2 x i64> %vbsl3.i
}
@@ -325,16 +255,8 @@ define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounw
define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_u64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d19, r2, r3
-; CHECK-NEXT: add r12, sp, #16
-; CHECK-NEXT: vmov d18, r0, r1
-; CHECK-NEXT: mov r0, sp
-; CHECK-NEXT: vld1.64 {d16, d17}, [r12]
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0]
-; CHECK-NEXT: vbit q8, q10, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; CHECK-NEXT: vbsl q0, q1, q2
+; CHECK-NEXT: bx lr
%vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
ret <2 x i64> %vbsl3.i
}
diff --git a/llvm/test/CodeGen/Generic/allow-check.ll b/llvm/test/CodeGen/Generic/allow-check.ll
index a08488959862..148ee811ea80 100644
--- a/llvm/test/CodeGen/Generic/allow-check.ll
+++ b/llvm/test/CodeGen/Generic/allow-check.ll
@@ -1,5 +1,5 @@
; Avoid `!DL->isLittleEndian() && !CLI->enableBigEndian()` missmatch on PPC64BE.
-; REQUIRES: host-byteorder-little-endian
+; REQUIRES: target-byteorder-little-endian
; -global-isel=1 is unsupported.
; XFAIL: target=loongarch{{.*}}
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 58e2e644b000..a40b4d85773b 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -40,9 +40,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
@@ -66,9 +66,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test2_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test2_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
index 450fe968d491..2b8129acb91f 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS: # %bb.0: # %entry
; MIPS-NEXT: addiu $sp, $sp, -32
; MIPS-NEXT: .cfi_def_cfa_offset 32
-; MIPS-NEXT: swl $7, 28($sp)
-; MIPS-NEXT: swl $6, 24($sp)
; MIPS-NEXT: sra $1, $4, 31
-; MIPS-NEXT: swl $5, 20($sp)
-; MIPS-NEXT: swl $4, 16($sp)
-; MIPS-NEXT: swl $1, 12($sp)
-; MIPS-NEXT: swl $1, 8($sp)
-; MIPS-NEXT: swl $1, 4($sp)
-; MIPS-NEXT: swl $1, 0($sp)
-; MIPS-NEXT: addiu $2, $sp, 0
-; MIPS-NEXT: swr $7, 31($sp)
-; MIPS-NEXT: swr $6, 27($sp)
-; MIPS-NEXT: swr $5, 23($sp)
-; MIPS-NEXT: swr $4, 19($sp)
-; MIPS-NEXT: swr $1, 15($sp)
-; MIPS-NEXT: swr $1, 11($sp)
-; MIPS-NEXT: swr $1, 7($sp)
-; MIPS-NEXT: swr $1, 3($sp)
-; MIPS-NEXT: addiu $1, $2, 16
+; MIPS-NEXT: sw $7, 28($sp)
+; MIPS-NEXT: sw $6, 24($sp)
+; MIPS-NEXT: sw $5, 20($sp)
+; MIPS-NEXT: sw $4, 16($sp)
+; MIPS-NEXT: sw $1, 12($sp)
+; MIPS-NEXT: sw $1, 8($sp)
+; MIPS-NEXT: sw $1, 4($sp)
+; MIPS-NEXT: sw $1, 0($sp)
+; MIPS-NEXT: addiu $1, $sp, 0
+; MIPS-NEXT: addiu $1, $1, 16
; MIPS-NEXT: lw $2, 60($sp)
; MIPS-NEXT: srl $3, $2, 3
-; MIPS-NEXT: andi $3, $3, 15
+; MIPS-NEXT: andi $3, $3, 12
; MIPS-NEXT: subu $1, $1, $3
-; MIPS-NEXT: lwl $3, 4($1)
-; MIPS-NEXT: lwr $3, 7($1)
-; MIPS-NEXT: sll $4, $3, 1
-; MIPS-NEXT: lwl $5, 8($1)
-; MIPS-NEXT: lwr $5, 11($1)
-; MIPS-NEXT: andi $2, $2, 7
-; MIPS-NEXT: not $6, $2
-; MIPS-NEXT: srlv $7, $5, $2
-; MIPS-NEXT: sllv $4, $4, $6
+; MIPS-NEXT: lw $3, 4($1)
+; MIPS-NEXT: lw $5, 8($1)
+; MIPS-NEXT: srlv $4, $5, $2
+; MIPS-NEXT: sll $6, $3, 1
+; MIPS-NEXT: andi $7, $2, 31
+; MIPS-NEXT: xori $7, $7, 31
+; MIPS-NEXT: sllv $6, $6, $7
; MIPS-NEXT: srlv $3, $3, $2
-; MIPS-NEXT: lwl $6, 0($1)
-; MIPS-NEXT: lwr $6, 3($1)
-; MIPS-NEXT: sll $8, $6, 1
-; MIPS-NEXT: xori $9, $2, 31
-; MIPS-NEXT: sllv $8, $8, $9
-; MIPS-NEXT: or $3, $3, $8
-; MIPS-NEXT: or $4, $7, $4
-; MIPS-NEXT: lwl $7, 12($1)
-; MIPS-NEXT: lwr $7, 15($1)
-; MIPS-NEXT: srlv $1, $7, $2
+; MIPS-NEXT: lw $8, 0($1)
+; MIPS-NEXT: sll $9, $8, 1
+; MIPS-NEXT: sllv $9, $9, $7
+; MIPS-NEXT: or $3, $3, $9
+; MIPS-NEXT: or $4, $4, $6
+; MIPS-NEXT: lw $1, 12($1)
+; MIPS-NEXT: srlv $1, $1, $2
; MIPS-NEXT: sll $5, $5, 1
-; MIPS-NEXT: sllv $5, $5, $9
+; MIPS-NEXT: sllv $5, $5, $7
; MIPS-NEXT: or $5, $1, $5
-; MIPS-NEXT: srav $2, $6, $2
+; MIPS-NEXT: srav $2, $8, $2
; MIPS-NEXT: jr $ra
; MIPS-NEXT: addiu $sp, $sp, 32
;
@@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
; MIPS32-NEXT: sra $1, $4, 31
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $1, 12($sp)
-; MIPS32-NEXT: swl $1, 8($sp)
-; MIPS32-NEXT: swl $1, 4($sp)
-; MIPS32-NEXT: swl $1, 0($sp)
-; MIPS32-NEXT: addiu $2, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $1, 15($sp)
-; MIPS32-NEXT: swr $1, 11($sp)
-; MIPS32-NEXT: swr $1, 7($sp)
-; MIPS32-NEXT: swr $1, 3($sp)
-; MIPS32-NEXT: addiu $1, $2, 16
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
+; MIPS32-NEXT: sw $1, 12($sp)
+; MIPS32-NEXT: sw $1, 8($sp)
+; MIPS32-NEXT: sw $1, 4($sp)
+; MIPS32-NEXT: sw $1, 0($sp)
+; MIPS32-NEXT: addiu $1, $sp, 0
+; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srav $2, $6, $2
+; MIPS32-NEXT: srav $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R2: # %bb.0: # %entry
; 32R2-NEXT: addiu $sp, $sp, -32
; 32R2-NEXT: .cfi_def_cfa_offset 32
-; 32R2-NEXT: swl $7, 28($sp)
-; 32R2-NEXT: swl $6, 24($sp)
-; 32R2-NEXT: swl $5, 20($sp)
; 32R2-NEXT: sra $1, $4, 31
-; 32R2-NEXT: swl $4, 16($sp)
-; 32R2-NEXT: swl $1, 12($sp)
-; 32R2-NEXT: swl $1, 8($sp)
-; 32R2-NEXT: swl $1, 4($sp)
-; 32R2-NEXT: swl $1, 0($sp)
-; 32R2-NEXT: swr $7, 31($sp)
-; 32R2-NEXT: swr $6, 27($sp)
-; 32R2-NEXT: swr $5, 23($sp)
-; 32R2-NEXT: swr $4, 19($sp)
-; 32R2-NEXT: swr $1, 15($sp)
-; 32R2-NEXT: swr $1, 11($sp)
-; 32R2-NEXT: swr $1, 7($sp)
-; 32R2-NEXT: swr $1, 3($sp)
+; 32R2-NEXT: sw $7, 28($sp)
+; 32R2-NEXT: sw $6, 24($sp)
+; 32R2-NEXT: sw $5, 20($sp)
+; 32R2-NEXT: sw $4, 16($sp)
+; 32R2-NEXT: sw $1, 12($sp)
+; 32R2-NEXT: sw $1, 8($sp)
+; 32R2-NEXT: sw $1, 4($sp)
+; 32R2-NEXT: sw $1, 0($sp)
; 32R2-NEXT: addiu $1, $sp, 0
; 32R2-NEXT: addiu $1, $1, 16
; 32R2-NEXT: lw $2, 60($sp)
-; 32R2-NEXT: ext $3, $2, 3, 4
+; 32R2-NEXT: srl $3, $2, 3
+; 32R2-NEXT: andi $3, $3, 12
; 32R2-NEXT: subu $1, $1, $3
-; 32R2-NEXT: lwl $3, 4($1)
-; 32R2-NEXT: lwr $3, 7($1)
-; 32R2-NEXT: sll $4, $3, 1
-; 32R2-NEXT: lwl $5, 8($1)
-; 32R2-NEXT: lwr $5, 11($1)
-; 32R2-NEXT: andi $2, $2, 7
-; 32R2-NEXT: not $6, $2
-; 32R2-NEXT: srlv $7, $5, $2
-; 32R2-NEXT: sllv $4, $4, $6
+; 32R2-NEXT: lw $3, 4($1)
+; 32R2-NEXT: lw $5, 8($1)
+; 32R2-NEXT: srlv $4, $5, $2
+; 32R2-NEXT: sll $6, $3, 1
+; 32R2-NEXT: andi $7, $2, 31
+; 32R2-NEXT: xori $7, $7, 31
+; 32R2-NEXT: sllv $6, $6, $7
; 32R2-NEXT: srlv $3, $3, $2
-; 32R2-NEXT: lwl $6, 0($1)
-; 32R2-NEXT: lwr $6, 3($1)
-; 32R2-NEXT: sll $8, $6, 1
-; 32R2-NEXT: xori $9, $2, 31
-; 32R2-NEXT: sllv $8, $8, $9
-; 32R2-NEXT: or $3, $3, $8
-; 32R2-NEXT: or $4, $7, $4
-; 32R2-NEXT: lwl $7, 12($1)
-; 32R2-NEXT: lwr $7, 15($1)
-; 32R2-NEXT: srlv $1, $7, $2
+; 32R2-NEXT: lw $8, 0($1)
+; 32R2-NEXT: sll $9, $8, 1
+; 32R2-NEXT: sllv $9, $9, $7
+; 32R2-NEXT: or $3, $3, $9
+; 32R2-NEXT: or $4, $4, $6
+; 32R2-NEXT: lw $1, 12($1)
+; 32R2-NEXT: srlv $1, $1, $2
; 32R2-NEXT: sll $5, $5, 1
-; 32R2-NEXT: sllv $5, $5, $9
+; 32R2-NEXT: sllv $5, $5, $7
; 32R2-NEXT: or $5, $1, $5
-; 32R2-NEXT: srav $2, $6, $2
+; 32R2-NEXT: srav $2, $8, $2
; 32R2-NEXT: jr $ra
; 32R2-NEXT: addiu $sp, $sp, 32
;
@@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; 32R6-NEXT: addiu $1, $sp, 0
; 32R6-NEXT: addiu $1, $1, 16
; 32R6-NEXT: lw $2, 60($sp)
-; 32R6-NEXT: ext $3, $2, 3, 4
+; 32R6-NEXT: srl $3, $2, 3
+; 32R6-NEXT: andi $3, $3, 12
; 32R6-NEXT: subu $1, $1, $3
; 32R6-NEXT: lw $3, 4($1)
-; 32R6-NEXT: sll $4, $3, 1
; 32R6-NEXT: lw $5, 8($1)
-; 32R6-NEXT: andi $2, $2, 7
-; 32R6-NEXT: not $6, $2
-; 32R6-NEXT: srlv $7, $5, $2
-; 32R6-NEXT: sllv $4, $4, $6
+; 32R6-NEXT: srlv $4, $5, $2
+; 32R6-NEXT: sll $6, $3, 1
+; 32R6-NEXT: andi $7, $2, 31
+; 32R6-NEXT: xori $7, $7, 31
+; 32R6-NEXT: sllv $6, $6, $7
; 32R6-NEXT: srlv $3, $3, $2
-; 32R6-NEXT: lw $6, 0($1)
-; 32R6-NEXT: sll $8, $6, 1
-; 32R6-NEXT: xori $9, $2, 31
-; 32R6-NEXT: sllv $8, $8, $9
-; 32R6-NEXT: or $3, $3, $8
-; 32R6-NEXT: or $4, $7, $4
+; 32R6-NEXT: lw $8, 0($1)
+; 32R6-NEXT: sll $9, $8, 1
+; 32R6-NEXT: sllv $9, $9, $7
+; 32R6-NEXT: or $3, $3, $9
+; 32R6-NEXT: or $4, $4, $6
; 32R6-NEXT: lw $1, 12($1)
; 32R6-NEXT: srlv $1, $1, $2
; 32R6-NEXT: sll $5, $5, 1
-; 32R6-NEXT: sllv $5, $5, $9
+; 32R6-NEXT: sllv $5, $5, $7
; 32R6-NEXT: or $5, $1, $5
-; 32R6-NEXT: srav $2, $6, $2
+; 32R6-NEXT: srav $2, $8, $2
; 32R6-NEXT: jr $ra
; 32R6-NEXT: addiu $sp, $sp, 32
;
@@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: sra $1, $4, 31
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $1, 12($sp)
-; MMR3-NEXT: swl $1, 8($sp)
-; MMR3-NEXT: swl $1, 4($sp)
-; MMR3-NEXT: swl $1, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $1, 15($sp)
-; MMR3-NEXT: swr $1, 11($sp)
-; MMR3-NEXT: swr $1, 7($sp)
-; MMR3-NEXT: swr $1, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $1, 12($sp)
+; MMR3-NEXT: sw $1, 8($sp)
+; MMR3-NEXT: sw $1, 4($sp)
+; MMR3-NEXT: sw $1, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srav $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srav $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -714,40 +660,39 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: sra $1, $4, 31
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $1, 16($sp)
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $5, 20($sp)
+; MMR6-NEXT: sw $4, 16($sp)
; MMR6-NEXT: sw $1, 12($sp)
; MMR6-NEXT: sw $1, 8($sp)
; MMR6-NEXT: sw $1, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: sw $1, 0($sp)
+; MMR6-NEXT: addiu $2, $sp, 0
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srav $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srav $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
index 03cf104e3120..69b842c73db1 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $7, 28($sp)
-; MIPS2-NEXT: swl $6, 24($sp)
-; MIPS2-NEXT: swl $5, 20($sp)
-; MIPS2-NEXT: swl $4, 16($sp)
-; MIPS2-NEXT: swl $zero, 12($sp)
-; MIPS2-NEXT: swl $zero, 8($sp)
-; MIPS2-NEXT: swl $zero, 4($sp)
-; MIPS2-NEXT: swl $zero, 0($sp)
; MIPS2-NEXT: addiu $1, $sp, 0
-; MIPS2-NEXT: swr $7, 31($sp)
-; MIPS2-NEXT: swr $6, 27($sp)
-; MIPS2-NEXT: swr $5, 23($sp)
-; MIPS2-NEXT: swr $4, 19($sp)
-; MIPS2-NEXT: swr $zero, 15($sp)
-; MIPS2-NEXT: swr $zero, 11($sp)
-; MIPS2-NEXT: swr $zero, 7($sp)
-; MIPS2-NEXT: swr $zero, 3($sp)
+; MIPS2-NEXT: sw $7, 28($sp)
+; MIPS2-NEXT: sw $6, 24($sp)
+; MIPS2-NEXT: sw $5, 20($sp)
+; MIPS2-NEXT: sw $4, 16($sp)
; MIPS2-NEXT: addiu $1, $1, 16
; MIPS2-NEXT: lw $2, 60($sp)
; MIPS2-NEXT: srl $3, $2, 3
-; MIPS2-NEXT: andi $3, $3, 15
+; MIPS2-NEXT: andi $3, $3, 12
; MIPS2-NEXT: subu $1, $1, $3
-; MIPS2-NEXT: lwl $3, 4($1)
-; MIPS2-NEXT: lwr $3, 7($1)
-; MIPS2-NEXT: sll $4, $3, 1
-; MIPS2-NEXT: lwl $5, 8($1)
-; MIPS2-NEXT: lwr $5, 11($1)
-; MIPS2-NEXT: andi $2, $2, 7
-; MIPS2-NEXT: not $6, $2
-; MIPS2-NEXT: srlv $7, $5, $2
-; MIPS2-NEXT: sllv $4, $4, $6
+; MIPS2-NEXT: sw $zero, 12($sp)
+; MIPS2-NEXT: sw $zero, 8($sp)
+; MIPS2-NEXT: sw $zero, 4($sp)
+; MIPS2-NEXT: sw $zero, 0($sp)
+; MIPS2-NEXT: lw $3, 4($1)
+; MIPS2-NEXT: lw $5, 8($1)
+; MIPS2-NEXT: srlv $4, $5, $2
+; MIPS2-NEXT: sll $6, $3, 1
+; MIPS2-NEXT: andi $7, $2, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: sllv $6, $6, $7
; MIPS2-NEXT: srlv $3, $3, $2
-; MIPS2-NEXT: lwl $6, 0($1)
-; MIPS2-NEXT: lwr $6, 3($1)
-; MIPS2-NEXT: sll $8, $6, 1
-; MIPS2-NEXT: xori $9, $2, 31
-; MIPS2-NEXT: sllv $8, $8, $9
-; MIPS2-NEXT: or $3, $3, $8
-; MIPS2-NEXT: or $4, $7, $4
-; MIPS2-NEXT: lwl $7, 12($1)
-; MIPS2-NEXT: lwr $7, 15($1)
-; MIPS2-NEXT: srlv $1, $7, $2
+; MIPS2-NEXT: lw $8, 0($1)
+; MIPS2-NEXT: sll $9, $8, 1
+; MIPS2-NEXT: sllv $9, $9, $7
+; MIPS2-NEXT: or $3, $3, $9
+; MIPS2-NEXT: or $4, $4, $6
+; MIPS2-NEXT: lw $1, 12($1)
+; MIPS2-NEXT: srlv $1, $1, $2
; MIPS2-NEXT: sll $5, $5, 1
-; MIPS2-NEXT: sllv $5, $5, $9
+; MIPS2-NEXT: sllv $5, $5, $7
; MIPS2-NEXT: or $5, $1, $5
-; MIPS2-NEXT: srlv $2, $6, $2
+; MIPS2-NEXT: srlv $2, $8, $2
; MIPS2-NEXT: jr $ra
; MIPS2-NEXT: addiu $sp, $sp, 32
;
@@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $7, 28($sp)
-; MIPS32-NEXT: swl $6, 24($sp)
-; MIPS32-NEXT: swl $5, 20($sp)
-; MIPS32-NEXT: swl $4, 16($sp)
-; MIPS32-NEXT: swl $zero, 12($sp)
-; MIPS32-NEXT: swl $zero, 8($sp)
-; MIPS32-NEXT: swl $zero, 4($sp)
-; MIPS32-NEXT: swl $zero, 0($sp)
; MIPS32-NEXT: addiu $1, $sp, 0
-; MIPS32-NEXT: swr $7, 31($sp)
-; MIPS32-NEXT: swr $6, 27($sp)
-; MIPS32-NEXT: swr $5, 23($sp)
-; MIPS32-NEXT: swr $4, 19($sp)
-; MIPS32-NEXT: swr $zero, 15($sp)
-; MIPS32-NEXT: swr $zero, 11($sp)
-; MIPS32-NEXT: swr $zero, 7($sp)
-; MIPS32-NEXT: swr $zero, 3($sp)
+; MIPS32-NEXT: sw $7, 28($sp)
+; MIPS32-NEXT: sw $6, 24($sp)
+; MIPS32-NEXT: sw $5, 20($sp)
+; MIPS32-NEXT: sw $4, 16($sp)
; MIPS32-NEXT: addiu $1, $1, 16
; MIPS32-NEXT: lw $2, 60($sp)
; MIPS32-NEXT: srl $3, $2, 3
-; MIPS32-NEXT: andi $3, $3, 15
+; MIPS32-NEXT: andi $3, $3, 12
; MIPS32-NEXT: subu $1, $1, $3
-; MIPS32-NEXT: lwl $3, 4($1)
-; MIPS32-NEXT: lwr $3, 7($1)
-; MIPS32-NEXT: sll $4, $3, 1
-; MIPS32-NEXT: lwl $5, 8($1)
-; MIPS32-NEXT: lwr $5, 11($1)
-; MIPS32-NEXT: andi $2, $2, 7
-; MIPS32-NEXT: not $6, $2
-; MIPS32-NEXT: srlv $7, $5, $2
-; MIPS32-NEXT: sllv $4, $4, $6
+; MIPS32-NEXT: sw $zero, 12($sp)
+; MIPS32-NEXT: sw $zero, 8($sp)
+; MIPS32-NEXT: sw $zero, 4($sp)
+; MIPS32-NEXT: sw $zero, 0($sp)
+; MIPS32-NEXT: lw $3, 4($1)
+; MIPS32-NEXT: lw $5, 8($1)
+; MIPS32-NEXT: srlv $4, $5, $2
+; MIPS32-NEXT: sll $6, $3, 1
+; MIPS32-NEXT: andi $7, $2, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: sllv $6, $6, $7
; MIPS32-NEXT: srlv $3, $3, $2
-; MIPS32-NEXT: lwl $6, 0($1)
-; MIPS32-NEXT: lwr $6, 3($1)
-; MIPS32-NEXT: sll $8, $6, 1
-; MIPS32-NEXT: xori $9, $2, 31
-; MIPS32-NEXT: sllv $8, $8, $9
-; MIPS32-NEXT: or $3, $3, $8
-; MIPS32-NEXT: or $4, $7, $4
-; MIPS32-NEXT: lwl $7, 12($1)
-; MIPS32-NEXT: lwr $7, 15($1)
-; MIPS32-NEXT: srlv $1, $7, $2
+; MIPS32-NEXT: lw $8, 0($1)
+; MIPS32-NEXT: sll $9, $8, 1
+; MIPS32-NEXT: sllv $9, $9, $7
+; MIPS32-NEXT: or $3, $3, $9
+; MIPS32-NEXT: or $4, $4, $6
+; MIPS32-NEXT: lw $1, 12($1)
+; MIPS32-NEXT: srlv $1, $1, $2
; MIPS32-NEXT: sll $5, $5, 1
-; MIPS32-NEXT: sllv $5, $5, $9
+; MIPS32-NEXT: sllv $5, $5, $7
; MIPS32-NEXT: or $5, $1, $5
-; MIPS32-NEXT: srlv $2, $6, $2
+; MIPS32-NEXT: srlv $2, $8, $2
; MIPS32-NEXT: jr $ra
; MIPS32-NEXT: addiu $sp, $sp, 32
;
@@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $7, 28($sp)
-; MIPS32R2-NEXT: swl $6, 24($sp)
-; MIPS32R2-NEXT: swl $5, 20($sp)
-; MIPS32R2-NEXT: swl $4, 16($sp)
-; MIPS32R2-NEXT: swl $zero, 12($sp)
-; MIPS32R2-NEXT: swl $zero, 8($sp)
-; MIPS32R2-NEXT: swl $zero, 4($sp)
-; MIPS32R2-NEXT: swl $zero, 0($sp)
-; MIPS32R2-NEXT: swr $7, 31($sp)
-; MIPS32R2-NEXT: swr $6, 27($sp)
-; MIPS32R2-NEXT: swr $5, 23($sp)
-; MIPS32R2-NEXT: swr $4, 19($sp)
-; MIPS32R2-NEXT: swr $zero, 15($sp)
-; MIPS32R2-NEXT: swr $zero, 11($sp)
-; MIPS32R2-NEXT: swr $zero, 7($sp)
-; MIPS32R2-NEXT: swr $zero, 3($sp)
; MIPS32R2-NEXT: addiu $1, $sp, 0
+; MIPS32R2-NEXT: sw $7, 28($sp)
+; MIPS32R2-NEXT: sw $6, 24($sp)
+; MIPS32R2-NEXT: sw $5, 20($sp)
+; MIPS32R2-NEXT: sw $4, 16($sp)
; MIPS32R2-NEXT: addiu $1, $1, 16
; MIPS32R2-NEXT: lw $2, 60($sp)
-; MIPS32R2-NEXT: ext $3, $2, 3, 4
+; MIPS32R2-NEXT: srl $3, $2, 3
+; MIPS32R2-NEXT: andi $3, $3, 12
; MIPS32R2-NEXT: subu $1, $1, $3
-; MIPS32R2-NEXT: lwl $3, 4($1)
-; MIPS32R2-NEXT: lwr $3, 7($1)
-; MIPS32R2-NEXT: sll $4, $3, 1
-; MIPS32R2-NEXT: lwl $5, 8($1)
-; MIPS32R2-NEXT: lwr $5, 11($1)
-; MIPS32R2-NEXT: andi $2, $2, 7
-; MIPS32R2-NEXT: not $6, $2
-; MIPS32R2-NEXT: srlv $7, $5, $2
-; MIPS32R2-NEXT: sllv $4, $4, $6
+; MIPS32R2-NEXT: sw $zero, 12($sp)
+; MIPS32R2-NEXT: sw $zero, 8($sp)
+; MIPS32R2-NEXT: sw $zero, 4($sp)
+; MIPS32R2-NEXT: sw $zero, 0($sp)
+; MIPS32R2-NEXT: lw $3, 4($1)
+; MIPS32R2-NEXT: lw $5, 8($1)
+; MIPS32R2-NEXT: srlv $4, $5, $2
+; MIPS32R2-NEXT: sll $6, $3, 1
+; MIPS32R2-NEXT: andi $7, $2, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: sllv $6, $6, $7
; MIPS32R2-NEXT: srlv $3, $3, $2
-; MIPS32R2-NEXT: lwl $6, 0($1)
-; MIPS32R2-NEXT: lwr $6, 3($1)
-; MIPS32R2-NEXT: sll $8, $6, 1
-; MIPS32R2-NEXT: xori $9, $2, 31
-; MIPS32R2-NEXT: sllv $8, $8, $9
-; MIPS32R2-NEXT: or $3, $3, $8
-; MIPS32R2-NEXT: or $4, $7, $4
-; MIPS32R2-NEXT: lwl $7, 12($1)
-; MIPS32R2-NEXT: lwr $7, 15($1)
-; MIPS32R2-NEXT: srlv $1, $7, $2
+; MIPS32R2-NEXT: lw $8, 0($1)
+; MIPS32R2-NEXT: sll $9, $8, 1
+; MIPS32R2-NEXT: sllv $9, $9, $7
+; MIPS32R2-NEXT: or $3, $3, $9
+; MIPS32R2-NEXT: or $4, $4, $6
+; MIPS32R2-NEXT: lw $1, 12($1)
+; MIPS32R2-NEXT: srlv $1, $1, $2
; MIPS32R2-NEXT: sll $5, $5, 1
-; MIPS32R2-NEXT: sllv $5, $5, $9
+; MIPS32R2-NEXT: sllv $5, $5, $7
; MIPS32R2-NEXT: or $5, $1, $5
-; MIPS32R2-NEXT: srlv $2, $6, $2
+; MIPS32R2-NEXT: srlv $2, $8, $2
; MIPS32R2-NEXT: jr $ra
; MIPS32R2-NEXT: addiu $sp, $sp, 32
;
@@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $4, 16($sp)
; MIPS32R6-NEXT: addiu $1, $1, 16
; MIPS32R6-NEXT: lw $2, 60($sp)
-; MIPS32R6-NEXT: ext $3, $2, 3, 4
+; MIPS32R6-NEXT: srl $3, $2, 3
+; MIPS32R6-NEXT: andi $3, $3, 12
; MIPS32R6-NEXT: subu $1, $1, $3
; MIPS32R6-NEXT: sw $zero, 12($sp)
; MIPS32R6-NEXT: sw $zero, 8($sp)
; MIPS32R6-NEXT: sw $zero, 4($sp)
; MIPS32R6-NEXT: sw $zero, 0($sp)
; MIPS32R6-NEXT: lw $3, 4($1)
-; MIPS32R6-NEXT: sll $4, $3, 1
; MIPS32R6-NEXT: lw $5, 8($1)
-; MIPS32R6-NEXT: andi $2, $2, 7
-; MIPS32R6-NEXT: not $6, $2
-; MIPS32R6-NEXT: srlv $7, $5, $2
-; MIPS32R6-NEXT: sllv $4, $4, $6
+; MIPS32R6-NEXT: srlv $4, $5, $2
+; MIPS32R6-NEXT: sll $6, $3, 1
+; MIPS32R6-NEXT: andi $7, $2, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: sllv $6, $6, $7
; MIPS32R6-NEXT: srlv $3, $3, $2
-; MIPS32R6-NEXT: lw $6, 0($1)
-; MIPS32R6-NEXT: sll $8, $6, 1
-; MIPS32R6-NEXT: xori $9, $2, 31
-; MIPS32R6-NEXT: sllv $8, $8, $9
-; MIPS32R6-NEXT: or $3, $3, $8
-; MIPS32R6-NEXT: or $4, $7, $4
+; MIPS32R6-NEXT: lw $8, 0($1)
+; MIPS32R6-NEXT: sll $9, $8, 1
+; MIPS32R6-NEXT: sllv $9, $9, $7
+; MIPS32R6-NEXT: or $3, $3, $9
+; MIPS32R6-NEXT: or $4, $4, $6
; MIPS32R6-NEXT: lw $1, 12($1)
; MIPS32R6-NEXT: srlv $1, $1, $2
; MIPS32R6-NEXT: sll $5, $5, 1
-; MIPS32R6-NEXT: sllv $5, $5, $9
+; MIPS32R6-NEXT: sllv $5, $5, $7
; MIPS32R6-NEXT: or $5, $1, $5
-; MIPS32R6-NEXT: srlv $2, $6, $2
+; MIPS32R6-NEXT: srlv $2, $8, $2
; MIPS32R6-NEXT: jr $ra
; MIPS32R6-NEXT: addiu $sp, $sp, 32
;
@@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: swp $16, 32($sp)
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
-; MMR3-NEXT: swl $7, 28($sp)
-; MMR3-NEXT: swl $6, 24($sp)
-; MMR3-NEXT: swl $5, 20($sp)
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $4, 16($sp)
-; MMR3-NEXT: swl $2, 12($sp)
-; MMR3-NEXT: swl $2, 8($sp)
-; MMR3-NEXT: swl $2, 4($sp)
-; MMR3-NEXT: swl $2, 0($sp)
-; MMR3-NEXT: swr $7, 31($sp)
-; MMR3-NEXT: swr $6, 27($sp)
-; MMR3-NEXT: swr $5, 23($sp)
-; MMR3-NEXT: swr $4, 19($sp)
-; MMR3-NEXT: swr $2, 15($sp)
-; MMR3-NEXT: swr $2, 11($sp)
-; MMR3-NEXT: swr $2, 7($sp)
-; MMR3-NEXT: swr $2, 3($sp)
+; MMR3-NEXT: swp $6, 24($sp)
+; MMR3-NEXT: swp $4, 16($sp)
+; MMR3-NEXT: sw $2, 12($sp)
+; MMR3-NEXT: sw $2, 8($sp)
+; MMR3-NEXT: sw $2, 4($sp)
+; MMR3-NEXT: sw $2, 0($sp)
; MMR3-NEXT: addiur1sp $2, 0
; MMR3-NEXT: addiur2 $2, $2, 16
; MMR3-NEXT: lw $3, 68($sp)
-; MMR3-NEXT: ext $4, $3, 3, 4
-; MMR3-NEXT: subu16 $2, $2, $4
-; MMR3-NEXT: lwl $7, 4($2)
-; MMR3-NEXT: lwr $7, 7($2)
-; MMR3-NEXT: sll16 $4, $7, 1
-; MMR3-NEXT: lwl $5, 8($2)
-; MMR3-NEXT: lwr $5, 11($2)
-; MMR3-NEXT: andi16 $6, $3, 7
-; MMR3-NEXT: not16 $3, $6
-; MMR3-NEXT: andi16 $3, $3, 31
-; MMR3-NEXT: srlv $16, $5, $6
-; MMR3-NEXT: sllv $4, $4, $3
-; MMR3-NEXT: srlv $17, $7, $6
-; MMR3-NEXT: lwl $7, 0($2)
-; MMR3-NEXT: lwr $7, 3($2)
-; MMR3-NEXT: sll16 $3, $7, 1
-; MMR3-NEXT: xori $1, $6, 31
+; MMR3-NEXT: srl16 $4, $3, 3
+; MMR3-NEXT: andi $4, $4, 12
+; MMR3-NEXT: subu16 $5, $2, $4
+; MMR3-NEXT: lwp $6, 4($5)
+; MMR3-NEXT: andi16 $2, $3, 31
+; MMR3-NEXT: srlv $16, $7, $2
+; MMR3-NEXT: sll16 $3, $6, 1
+; MMR3-NEXT: xori $1, $2, 31
+; MMR3-NEXT: sllv $4, $3, $1
+; MMR3-NEXT: srlv $6, $6, $2
+; MMR3-NEXT: lw16 $17, 0($5)
+; MMR3-NEXT: sll16 $3, $17, 1
; MMR3-NEXT: sllv $3, $3, $1
-; MMR3-NEXT: or16 $3, $17
+; MMR3-NEXT: or16 $3, $6
; MMR3-NEXT: or16 $4, $16
-; MMR3-NEXT: lwl $8, 12($2)
-; MMR3-NEXT: lwr $8, 15($2)
-; MMR3-NEXT: srlv $2, $8, $6
-; MMR3-NEXT: sll16 $5, $5, 1
+; MMR3-NEXT: lw16 $5, 12($5)
+; MMR3-NEXT: srlv $6, $5, $2
+; MMR3-NEXT: sll16 $5, $7, 1
; MMR3-NEXT: sllv $5, $5, $1
-; MMR3-NEXT: or16 $5, $2
-; MMR3-NEXT: srlv $2, $7, $6
+; MMR3-NEXT: or16 $5, $6
+; MMR3-NEXT: srlv $2, $17, $2
; MMR3-NEXT: lwp $16, 32($sp)
; MMR3-NEXT: addiusp 40
; MMR3-NEXT: jrc $ra
@@ -735,40 +681,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill
; MMR6-NEXT: .cfi_offset 16, -4
; MMR6-NEXT: li16 $2, 0
-; MMR6-NEXT: sw $7, 32($sp)
-; MMR6-NEXT: sw $6, 28($sp)
-; MMR6-NEXT: sw $5, 24($sp)
-; MMR6-NEXT: sw $4, 20($sp)
-; MMR6-NEXT: sw $2, 16($sp)
+; MMR6-NEXT: sw $7, 28($sp)
+; MMR6-NEXT: sw $6, 24($sp)
+; MMR6-NEXT: sw $5, 20($sp)
+; MMR6-NEXT: sw $4, 16($sp)
; MMR6-NEXT: sw $2, 12($sp)
; MMR6-NEXT: sw $2, 8($sp)
; MMR6-NEXT: sw $2, 4($sp)
-; MMR6-NEXT: addiu $2, $sp, 4
+; MMR6-NEXT: sw $2, 0($sp)
+; MMR6-NEXT: addiu $2, $sp, 0
; MMR6-NEXT: addiur2 $2, $2, 16
; MMR6-NEXT: lw $3, 68($sp)
-; MMR6-NEXT: ext $4, $3, 3, 4
-; MMR6-NEXT: subu16 $5, $2, $4
-; MMR6-NEXT: lw16 $4, 4($5)
-; MMR6-NEXT: sll16 $6, $4, 1
-; MMR6-NEXT: lw16 $7, 8($5)
-; MMR6-NEXT: andi16 $2, $3, 7
-; MMR6-NEXT: not16 $3, $2
-; MMR6-NEXT: andi16 $3, $3, 31
-; MMR6-NEXT: srlv $1, $7, $2
-; MMR6-NEXT: sllv $6, $6, $3
-; MMR6-NEXT: srlv $3, $4, $2
-; MMR6-NEXT: lw16 $16, 0($5)
+; MMR6-NEXT: srl16 $4, $3, 3
+; MMR6-NEXT: andi $4, $4, 12
+; MMR6-NEXT: subu16 $2, $2, $4
+; MMR6-NEXT: lw16 $4, 4($2)
+; MMR6-NEXT: lw16 $5, 8($2)
+; MMR6-NEXT: andi16 $6, $3, 31
+; MMR6-NEXT: srlv $1, $5, $6
+; MMR6-NEXT: sll16 $3, $4, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: sllv $8, $3, $7
+; MMR6-NEXT: srlv $3, $4, $6
+; MMR6-NEXT: lw16 $16, 0($2)
; MMR6-NEXT: sll16 $4, $16, 1
-; MMR6-NEXT: xori $8, $2, 31
-; MMR6-NEXT: sllv $4, $4, $8
+; MMR6-NEXT: sllv $4, $4, $7
; MMR6-NEXT: or $3, $3, $4
-; MMR6-NEXT: or $4, $1, $6
-; MMR6-NEXT: lw16 $5, 12($5)
-; MMR6-NEXT: srlv $1, $5, $2
-; MMR6-NEXT: sll16 $5, $7, 1
-; MMR6-NEXT: sllv $5, $5, $8
-; MMR6-NEXT: or $5, $1, $5
-; MMR6-NEXT: srlv $2, $16, $2
+; MMR6-NEXT: or $4, $1, $8
+; MMR6-NEXT: lw16 $2, 12($2)
+; MMR6-NEXT: srlv $1, $2, $6
+; MMR6-NEXT: sll16 $2, $5, 1
+; MMR6-NEXT: sllv $2, $2, $7
+; MMR6-NEXT: or $5, $1, $2
+; MMR6-NEXT: srlv $2, $16, $6
; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload
; MMR6-NEXT: addiu $sp, $sp, 40
; MMR6-NEXT: jrc $ra
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
index af3d4f50f3fe..8d548861f439 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2
; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \
@@ -13,9 +13,9 @@
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6
; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \
@@ -35,6 +35,11 @@
; RUN: FileCheck %s -check-prefix=MMR6
define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
+; MIPS2-LABEL: sdiv_i1:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: move $2, $4
+;
; GP32-LABEL: sdiv_i1:
; GP32: # %bb.0: # %entry
; GP32-NEXT: jr $ra
@@ -45,6 +50,11 @@ define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: move $2, $4
;
+; MIPS3-LABEL: sdiv_i1:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: move $2, $4
+;
; GP64-LABEL: sdiv_i1:
; GP64: # %bb.0: # %entry
; GP64-NEXT: jr $ra
@@ -70,6 +80,15 @@ entry:
}
define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) {
+; MIPS2-LABEL: sdiv_i8:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $1
+; MIPS2-NEXT: sll $1, $1, 24
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: sra $2, $1, 24
+;
; GP32R0R2-LABEL: sdiv_i8:
; GP32R0R2: # %bb.0: # %entry
; GP32R0R2-NEXT: div $zero, $4, $5
@@ -94,6 +113,15 @@ define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: seb $2, $1
;
+; MIPS3-LABEL: sdiv_i8:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $1
+; MIPS3-NEXT: sll $1, $1, 24
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: sra $2, $1, 24
+;
; GP64R0R1-LABEL: sdiv_i8:
; GP64R0R1: # %bb.0: # %entry
; GP64R0R1-NEXT: div $zero, $4, $5
@@ -138,6 +166,15 @@ entry:
}
define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) {
+; MIPS2-LABEL: sdiv_i16:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $1
+; MIPS2-NEXT: sll $1, $1, 16
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: sra $2, $1, 16
+;
; GP32R0R2-LABEL: sdiv_i16:
; GP32R0R2: # %bb.0: # %entry
; GP32R0R2-NEXT: div $zero, $4, $5
@@ -162,6 +199,15 @@ define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: seh $2, $1
;
+; MIPS3-LABEL: sdiv_i16:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $1
+; MIPS3-NEXT: sll $1, $1, 16
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: sra $2, $1, 16
+;
; GP64R0R1-LABEL: sdiv_i16:
; GP64R0R1: # %bb.0: # %entry
; GP64R0R1-NEXT: div $zero, $4, $5
@@ -206,6 +252,14 @@ entry:
}
define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) {
+; MIPS2-LABEL: sdiv_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: sdiv_i32:
; GP32: # %bb.0: # %entry
; GP32-NEXT: div $zero, $4, $5
@@ -219,6 +273,14 @@ define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: sdiv_i32:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: sdiv_i32:
; GP64: # %bb.0: # %entry
; GP64-NEXT: div $zero, $4, $5
@@ -250,6 +312,22 @@ entry:
}
define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
+; MIPS2-LABEL: sdiv_i64:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -24
+; MIPS2-NEXT: .cfi_def_cfa_offset 24
+; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $25, %call16(__divdi3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 24
+;
; GP32-LABEL: sdiv_i64:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -281,6 +359,14 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 24
;
+; MIPS3-LABEL: sdiv_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddiv $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: sdiv_i64:
; GP64: # %bb.0: # %entry
; GP64-NEXT: ddiv $zero, $4, $5
@@ -332,6 +418,30 @@ entry:
}
define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
+; MIPS2-LABEL: sdiv_i128:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -40
+; MIPS2-NEXT: .cfi_def_cfa_offset 40
+; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $1, 60($sp)
+; MIPS2-NEXT: lw $2, 64($sp)
+; MIPS2-NEXT: lw $3, 68($sp)
+; MIPS2-NEXT: sw $3, 28($sp)
+; MIPS2-NEXT: sw $2, 24($sp)
+; MIPS2-NEXT: sw $1, 20($sp)
+; MIPS2-NEXT: lw $1, 56($sp)
+; MIPS2-NEXT: sw $1, 16($sp)
+; MIPS2-NEXT: lw $25, %call16(__divti3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 40
+;
; GP32-LABEL: sdiv_i128:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -379,6 +489,25 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 40
;
+; MIPS3-LABEL: sdiv_i128:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: daddiu $sp, $sp, -16
+; MIPS3-NEXT: .cfi_def_cfa_offset 16
+; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: .cfi_offset 31, -8
+; MIPS3-NEXT: .cfi_offset 28, -16
+; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(sdiv_i128)))
+; MIPS3-NEXT: daddu $1, $1, $25
+; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(sdiv_i128)))
+; MIPS3-NEXT: ld $25, %call16(__divti3)($gp)
+; MIPS3-NEXT: jalr $25
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: daddiu $sp, $sp, 16
+;
; GP64-LABEL: sdiv_i128:
; GP64: # %bb.0: # %entry
; GP64-NEXT: daddiu $sp, $sp, -16
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
index 81f089a52947..394890a9dcc7 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS2: # %bb.0: # %entry
; MIPS2-NEXT: addiu $sp, $sp, -32
; MIPS2-NEXT: .cfi_def_cfa_offset 32
-; MIPS2-NEXT: swl $zero, 28($sp)
-; MIPS2-NEXT: swl $zero, 24($sp)
-; MIPS2-NEXT: swl $zero, 20($sp)
-; MIPS2-NEXT: swl $zero, 16($sp)
-; MIPS2-NEXT: swl $7, 12($sp)
-; MIPS2-NEXT: swl $6, 8($sp)
-; MIPS2-NEXT: swl $5, 4($sp)
-; MIPS2-NEXT: swl $4, 0($sp)
-; MIPS2-NEXT: swr $zero, 31($sp)
-; MIPS2-NEXT: swr $zero, 27($sp)
-; MIPS2-NEXT: swr $zero, 23($sp)
-; MIPS2-NEXT: swr $zero, 19($sp)
-; MIPS2-NEXT: swr $7, 15($sp)
-; MIPS2-NEXT: swr $6, 11($sp)
-; MIPS2-NEXT: swr $5, 7($sp)
-; MIPS2-NEXT: swr $4, 3($sp)
; MIPS2-NEXT: lw $1, 60($sp)
; MIPS2-NEXT: srl $2, $1, 3
-; MIPS2-NEXT: andi $2, $2, 15
+; MIPS2-NEXT: sw $7, 12($sp)
+; MIPS2-NEXT: sw $6, 8($sp)
+; MIPS2-NEXT: sw $5, 4($sp)
+; MIPS2-NEXT: sw $4, 0($sp)
+; MIPS2-NEXT: andi $2, $2, 12
; MIPS2-NEXT: addiu $3, $sp, 0
; MIPS2-NEXT: addu $4, $3, $2
-; MIPS2-NEXT: lwl $5, 8($4)
-; MIPS2-NEXT: lwr $5, 11($4)
-; MIPS2-NEXT: srl $2, $5, 1
-; MIPS2-NEXT: lwl $3, 4($4)
-; MIPS2-NEXT: lwr $3, 7($4)
-; MIPS2-NEXT: andi $1, $1, 7
-; MIPS2-NEXT: not $6, $1
-; MIPS2-NEXT: sllv $7, $3, $1
-; MIPS2-NEXT: srlv $6, $2, $6
-; MIPS2-NEXT: lwl $2, 0($4)
-; MIPS2-NEXT: lwr $2, 3($4)
-; MIPS2-NEXT: sllv $2, $2, $1
-; MIPS2-NEXT: srl $3, $3, 1
-; MIPS2-NEXT: xori $8, $1, 31
-; MIPS2-NEXT: srlv $3, $3, $8
-; MIPS2-NEXT: or $2, $2, $3
-; MIPS2-NEXT: or $3, $7, $6
+; MIPS2-NEXT: sw $zero, 28($sp)
+; MIPS2-NEXT: sw $zero, 24($sp)
+; MIPS2-NEXT: sw $zero, 20($sp)
+; MIPS2-NEXT: sw $zero, 16($sp)
+; MIPS2-NEXT: lw $5, 8($4)
+; MIPS2-NEXT: lw $2, 4($4)
+; MIPS2-NEXT: sllv $3, $2, $1
+; MIPS2-NEXT: srl $6, $5, 1
+; MIPS2-NEXT: andi $7, $1, 31
+; MIPS2-NEXT: xori $7, $7, 31
+; MIPS2-NEXT: srlv $6, $6, $7
+; MIPS2-NEXT: lw $8, 0($4)
+; MIPS2-NEXT: sllv $8, $8, $1
+; MIPS2-NEXT: srl $2, $2, 1
+; MIPS2-NEXT: srlv $2, $2, $7
+; MIPS2-NEXT: or $2, $8, $2
+; MIPS2-NEXT: or $3, $3, $6
; MIPS2-NEXT: sllv $5, $5, $1
-; MIPS2-NEXT: lwl $6, 12($4)
-; MIPS2-NEXT: lwr $6, 15($4)
+; MIPS2-NEXT: lw $6, 12($4)
; MIPS2-NEXT: srl $4, $6, 1
-; MIPS2-NEXT: srlv $4, $4, $8
+; MIPS2-NEXT: srlv $4, $4, $7
; MIPS2-NEXT: or $4, $5, $4
; MIPS2-NEXT: sllv $5, $6, $1
; MIPS2-NEXT: jr $ra
@@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32: # %bb.0: # %entry
; MIPS32-NEXT: addiu $sp, $sp, -32
; MIPS32-NEXT: .cfi_def_cfa_offset 32
-; MIPS32-NEXT: swl $zero, 28($sp)
-; MIPS32-NEXT: swl $zero, 24($sp)
-; MIPS32-NEXT: swl $zero, 20($sp)
-; MIPS32-NEXT: swl $zero, 16($sp)
-; MIPS32-NEXT: swl $7, 12($sp)
-; MIPS32-NEXT: swl $6, 8($sp)
-; MIPS32-NEXT: swl $5, 4($sp)
-; MIPS32-NEXT: swl $4, 0($sp)
-; MIPS32-NEXT: swr $zero, 31($sp)
-; MIPS32-NEXT: swr $zero, 27($sp)
-; MIPS32-NEXT: swr $zero, 23($sp)
-; MIPS32-NEXT: swr $zero, 19($sp)
-; MIPS32-NEXT: swr $7, 15($sp)
-; MIPS32-NEXT: swr $6, 11($sp)
-; MIPS32-NEXT: swr $5, 7($sp)
-; MIPS32-NEXT: swr $4, 3($sp)
; MIPS32-NEXT: lw $1, 60($sp)
; MIPS32-NEXT: srl $2, $1, 3
-; MIPS32-NEXT: andi $2, $2, 15
+; MIPS32-NEXT: sw $7, 12($sp)
+; MIPS32-NEXT: sw $6, 8($sp)
+; MIPS32-NEXT: sw $5, 4($sp)
+; MIPS32-NEXT: sw $4, 0($sp)
+; MIPS32-NEXT: andi $2, $2, 12
; MIPS32-NEXT: addiu $3, $sp, 0
; MIPS32-NEXT: addu $4, $3, $2
-; MIPS32-NEXT: lwl $5, 8($4)
-; MIPS32-NEXT: lwr $5, 11($4)
-; MIPS32-NEXT: srl $2, $5, 1
-; MIPS32-NEXT: lwl $3, 4($4)
-; MIPS32-NEXT: lwr $3, 7($4)
-; MIPS32-NEXT: andi $1, $1, 7
-; MIPS32-NEXT: not $6, $1
-; MIPS32-NEXT: sllv $7, $3, $1
-; MIPS32-NEXT: srlv $6, $2, $6
-; MIPS32-NEXT: lwl $2, 0($4)
-; MIPS32-NEXT: lwr $2, 3($4)
-; MIPS32-NEXT: sllv $2, $2, $1
-; MIPS32-NEXT: srl $3, $3, 1
-; MIPS32-NEXT: xori $8, $1, 31
-; MIPS32-NEXT: srlv $3, $3, $8
-; MIPS32-NEXT: or $2, $2, $3
-; MIPS32-NEXT: or $3, $7, $6
+; MIPS32-NEXT: sw $zero, 28($sp)
+; MIPS32-NEXT: sw $zero, 24($sp)
+; MIPS32-NEXT: sw $zero, 20($sp)
+; MIPS32-NEXT: sw $zero, 16($sp)
+; MIPS32-NEXT: lw $5, 8($4)
+; MIPS32-NEXT: lw $2, 4($4)
+; MIPS32-NEXT: sllv $3, $2, $1
+; MIPS32-NEXT: srl $6, $5, 1
+; MIPS32-NEXT: andi $7, $1, 31
+; MIPS32-NEXT: xori $7, $7, 31
+; MIPS32-NEXT: srlv $6, $6, $7
+; MIPS32-NEXT: lw $8, 0($4)
+; MIPS32-NEXT: sllv $8, $8, $1
+; MIPS32-NEXT: srl $2, $2, 1
+; MIPS32-NEXT: srlv $2, $2, $7
+; MIPS32-NEXT: or $2, $8, $2
+; MIPS32-NEXT: or $3, $3, $6
; MIPS32-NEXT: sllv $5, $5, $1
-; MIPS32-NEXT: lwl $6, 12($4)
-; MIPS32-NEXT: lwr $6, 15($4)
+; MIPS32-NEXT: lw $6, 12($4)
; MIPS32-NEXT: srl $4, $6, 1
-; MIPS32-NEXT: srlv $4, $4, $8
+; MIPS32-NEXT: srlv $4, $4, $7
; MIPS32-NEXT: or $4, $5, $4
; MIPS32-NEXT: sllv $5, $6, $1
; MIPS32-NEXT: jr $ra
@@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R2: # %bb.0: # %entry
; MIPS32R2-NEXT: addiu $sp, $sp, -32
; MIPS32R2-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R2-NEXT: swl $zero, 28($sp)
-; MIPS32R2-NEXT: swl $zero, 24($sp)
-; MIPS32R2-NEXT: swl $zero, 20($sp)
-; MIPS32R2-NEXT: swl $zero, 16($sp)
-; MIPS32R2-NEXT: swl $7, 12($sp)
-; MIPS32R2-NEXT: swl $6, 8($sp)
-; MIPS32R2-NEXT: swl $5, 4($sp)
-; MIPS32R2-NEXT: swl $4, 0($sp)
-; MIPS32R2-NEXT: swr $zero, 31($sp)
-; MIPS32R2-NEXT: swr $zero, 27($sp)
-; MIPS32R2-NEXT: swr $zero, 23($sp)
-; MIPS32R2-NEXT: swr $zero, 19($sp)
-; MIPS32R2-NEXT: swr $7, 15($sp)
-; MIPS32R2-NEXT: swr $6, 11($sp)
-; MIPS32R2-NEXT: swr $5, 7($sp)
-; MIPS32R2-NEXT: swr $4, 3($sp)
; MIPS32R2-NEXT: lw $1, 60($sp)
-; MIPS32R2-NEXT: ext $2, $1, 3, 4
+; MIPS32R2-NEXT: srl $2, $1, 3
+; MIPS32R2-NEXT: sw $7, 12($sp)
+; MIPS32R2-NEXT: sw $6, 8($sp)
+; MIPS32R2-NEXT: sw $5, 4($sp)
+; MIPS32R2-NEXT: sw $4, 0($sp)
+; MIPS32R2-NEXT: andi $2, $2, 12
; MIPS32R2-NEXT: addiu $3, $sp, 0
; MIPS32R2-NEXT: addu $4, $3, $2
-; MIPS32R2-NEXT: lwl $5, 8($4)
-; MIPS32R2-NEXT: lwr $5, 11($4)
-; MIPS32R2-NEXT: srl $2, $5, 1
-; MIPS32R2-NEXT: lwl $3, 4($4)
-; MIPS32R2-NEXT: lwr $3, 7($4)
-; MIPS32R2-NEXT: andi $1, $1, 7
-; MIPS32R2-NEXT: not $6, $1
-; MIPS32R2-NEXT: sllv $7, $3, $1
-; MIPS32R2-NEXT: srlv $6, $2, $6
-; MIPS32R2-NEXT: lwl $2, 0($4)
-; MIPS32R2-NEXT: lwr $2, 3($4)
-; MIPS32R2-NEXT: sllv $2, $2, $1
-; MIPS32R2-NEXT: srl $3, $3, 1
-; MIPS32R2-NEXT: xori $8, $1, 31
-; MIPS32R2-NEXT: srlv $3, $3, $8
-; MIPS32R2-NEXT: or $2, $2, $3
-; MIPS32R2-NEXT: or $3, $7, $6
+; MIPS32R2-NEXT: sw $zero, 28($sp)
+; MIPS32R2-NEXT: sw $zero, 24($sp)
+; MIPS32R2-NEXT: sw $zero, 20($sp)
+; MIPS32R2-NEXT: sw $zero, 16($sp)
+; MIPS32R2-NEXT: lw $5, 8($4)
+; MIPS32R2-NEXT: lw $2, 4($4)
+; MIPS32R2-NEXT: sllv $3, $2, $1
+; MIPS32R2-NEXT: srl $6, $5, 1
+; MIPS32R2-NEXT: andi $7, $1, 31
+; MIPS32R2-NEXT: xori $7, $7, 31
+; MIPS32R2-NEXT: srlv $6, $6, $7
+; MIPS32R2-NEXT: lw $8, 0($4)
+; MIPS32R2-NEXT: sllv $8, $8, $1
+; MIPS32R2-NEXT: srl $2, $2, 1
+; MIPS32R2-NEXT: srlv $2, $2, $7
+; MIPS32R2-NEXT: or $2, $8, $2
+; MIPS32R2-NEXT: or $3, $3, $6
; MIPS32R2-NEXT: sllv $5, $5, $1
-; MIPS32R2-NEXT: lwl $6, 12($4)
-; MIPS32R2-NEXT: lwr $6, 15($4)
+; MIPS32R2-NEXT: lw $6, 12($4)
; MIPS32R2-NEXT: srl $4, $6, 1
-; MIPS32R2-NEXT: srlv $4, $4, $8
+; MIPS32R2-NEXT: srlv $4, $4, $7
; MIPS32R2-NEXT: or $4, $5, $4
; MIPS32R2-NEXT: sllv $5, $6, $1
; MIPS32R2-NEXT: jr $ra
@@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: addiu $sp, $sp, -32
; MIPS32R6-NEXT: .cfi_def_cfa_offset 32
; MIPS32R6-NEXT: lw $1, 60($sp)
+; MIPS32R6-NEXT: srl $2, $1, 3
; MIPS32R6-NEXT: sw $7, 12($sp)
; MIPS32R6-NEXT: sw $6, 8($sp)
; MIPS32R6-NEXT: sw $5, 4($sp)
; MIPS32R6-NEXT: sw $4, 0($sp)
-; MIPS32R6-NEXT: ext $2, $1, 3, 4
+; MIPS32R6-NEXT: andi $2, $2, 12
; MIPS32R6-NEXT: addiu $3, $sp, 0
; MIPS32R6-NEXT: addu $4, $3, $2
; MIPS32R6-NEXT: sw $zero, 28($sp)
@@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MIPS32R6-NEXT: sw $zero, 20($sp)
; MIPS32R6-NEXT: sw $zero, 16($sp)
; MIPS32R6-NEXT: lw $5, 8($4)
-; MIPS32R6-NEXT: srl $2, $5, 1
-; MIPS32R6-NEXT: lw $3, 4($4)
-; MIPS32R6-NEXT: andi $1, $1, 7
-; MIPS32R6-NEXT: not $6, $1
-; MIPS32R6-NEXT: sllv $7, $3, $1
-; MIPS32R6-NEXT: srlv $6, $2, $6
-; MIPS32R6-NEXT: lw $2, 0($4)
-; MIPS32R6-NEXT: sllv $2, $2, $1
-; MIPS32R6-NEXT: srl $3, $3, 1
-; MIPS32R6-NEXT: xori $8, $1, 31
-; MIPS32R6-NEXT: srlv $3, $3, $8
-; MIPS32R6-NEXT: or $2, $2, $3
-; MIPS32R6-NEXT: or $3, $7, $6
+; MIPS32R6-NEXT: lw $2, 4($4)
+; MIPS32R6-NEXT: sllv $3, $2, $1
+; MIPS32R6-NEXT: srl $6, $5, 1
+; MIPS32R6-NEXT: andi $7, $1, 31
+; MIPS32R6-NEXT: xori $7, $7, 31
+; MIPS32R6-NEXT: srlv $6, $6, $7
+; MIPS32R6-NEXT: lw $8, 0($4)
+; MIPS32R6-NEXT: sllv $8, $8, $1
+; MIPS32R6-NEXT: srl $2, $2, 1
+; MIPS32R6-NEXT: srlv $2, $2, $7
+; MIPS32R6-NEXT: or $2, $8, $2
+; MIPS32R6-NEXT: or $3, $3, $6
; MIPS32R6-NEXT: sllv $5, $5, $1
; MIPS32R6-NEXT: lw $6, 12($4)
; MIPS32R6-NEXT: srl $4, $6, 1
-; MIPS32R6-NEXT: srlv $4, $4, $8
+; MIPS32R6-NEXT: srlv $4, $4, $7
; MIPS32R6-NEXT: or $4, $5, $4
; MIPS32R6-NEXT: sllv $5, $6, $1
; MIPS32R6-NEXT: jr $ra
@@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR3-NEXT: .cfi_offset 17, -4
; MMR3-NEXT: .cfi_offset 16, -8
; MMR3-NEXT: li16 $2, 0
-; MMR3-NEXT: swl $2, 28($sp)
-; MMR3-NEXT: swl $2, 24($sp)
-; MMR3-NEXT: swl $2, 20($sp)
-; MMR3-NEXT: swl $2, 16($sp)
-; MMR3-NEXT: swl $7, 12($sp)
-; MMR3-NEXT: swl $6, 8($sp)
-; MMR3-NEXT: swl $5, 4($sp)
-; MMR3-NEXT: swl $4, 0($sp)
-; MMR3-NEXT: swr $2, 31($sp)
-; MMR3-NEXT: swr $2, 27($sp)
-; MMR3-NEXT: swr $2, 23($sp)
-; MMR3-NEXT: swr $2, 19($sp)
-; MMR3-NEXT: swr $7, 15($sp)
-; MMR3-NEXT: swr $6, 11($sp)
-; MMR3-NEXT: swr $5, 7($sp)
-; MMR3-NEXT: swr $4, 3($sp)
+; MMR3-NEXT: sw $2, 28($sp)
+; MMR3-NEXT: sw $2, 24($sp)
+; MMR3-NEXT: sw $2, 20($sp)
+; MMR3-NEXT: sw $2, 16($sp)
+; MMR3-NEXT: swp $6, 8($sp)
+; MMR3-NEXT: swp $4, 0($sp)
; MMR3-NEXT: lw $2, 68($sp)
-; MMR3-NEXT: ext $3, $2, 3, 4
+; MMR3-NEXT: srl16 $3, $2, 3
+; MMR3-NEXT: andi $3, $3, 12
; MMR3-NEXT: addiur1sp $4, 0
; MMR3-NEXT: addu16 $4, $4, $3
-; MMR3-NEXT: lwl $6, 8($4)
-; MMR3-NEXT: lwr $6, 11($4)
-; MMR3-NEXT: srl16 $3, $6, 1
-; MMR3-NEXT: lwl $7, 4($4)
-; MMR3-NEXT: lwr $7, 7($4)
-; MMR3-NEXT: andi16 $5, $2, 7
-; MMR3-NEXT: not16 $2, $5
-; MMR3-NEXT: andi16 $2, $2, 31
+; MMR3-NEXT: lw16 $6, 8($4)
+; MMR3-NEXT: lw16 $7, 4($4)
+; MMR3-NEXT: andi16 $5, $2, 31
; MMR3-NEXT: sllv $16, $7, $5
-; MMR3-NEXT: srlv $3, $3, $2
-; MMR3-NEXT: lwl $1, 0($4)
-; MMR3-NEXT: lwr $1, 3($4)
-; MMR3-NEXT: sllv $17, $1, $5
-; MMR3-NEXT: srl16 $2, $7, 1
+; MMR3-NEXT: srl16 $2, $6, 1
; MMR3-NEXT: xori $1, $5, 31
+; MMR3-NEXT: srlv $3, $2, $1
+; MMR3-NEXT: lw16 $2, 0($4)
+; MMR3-NEXT: sllv $17, $2, $5
+; MMR3-NEXT: srl16 $2, $7, 1
; MMR3-NEXT: srlv $2, $2, $1
; MMR3-NEXT: or16 $2, $17
; MMR3-NEXT: or16 $3, $16
; MMR3-NEXT: sllv $6, $6, $5
-; MMR3-NEXT: lwl $7, 12($4)
-; MMR3-NEXT: lwr $7, 15($4)
+; MMR3-NEXT: lw16 $7, 12($4)
; MMR3-NEXT: srl16 $4, $7, 1
; MMR3-NEXT: srlv $4, $4, $1
; MMR3-NEXT: or16 $4, $6
@@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) {
; MMR6-NEXT: sw $5, 4($sp)
; MMR6-NEXT: sw $4, 0($sp)
; MMR6-NEXT: lw $2, 60($sp)
-; MMR6-NEXT: ext $3, $2, 3, 4
+; MMR6-NEXT: srl16 $3, $2, 3
+; MMR6-NEXT: andi $3, $3, 12
; MMR6-NEXT: addiu $4, $sp, 0
; MMR6-NEXT: addu16 $4, $4, $3
-; MMR6-NEXT: lw16 $6, 8($4)
-; MMR6-NEXT: srl16 $3, $6, 1
-; MMR6-NEXT: lw16 $7, 4($4)
-; MMR6-NEXT: andi16 $5, $2, 7
-; MMR6-NEXT: not16 $2, $5
-; MMR6-NEXT: andi16 $2, $2, 31
-; MMR6-NEXT: sllv $1, $7, $5
-; MMR6-NEXT: srlv $3, $3, $2
+; MMR6-NEXT: lw16 $5, 8($4)
+; MMR6-NEXT: lw16 $3, 4($4)
+; MMR6-NEXT: andi16 $6, $2, 31
+; MMR6-NEXT: sllv $1, $3, $6
+; MMR6-NEXT: srl16 $2, $5, 1
+; MMR6-NEXT: xori $7, $6, 31
+; MMR6-NEXT: srlv $8, $2, $7
; MMR6-NEXT: lw16 $2, 0($4)
-; MMR6-NEXT: sllv $2, $2, $5
-; MMR6-NEXT: srl16 $7, $7, 1
-; MMR6-NEXT: xori $8, $5, 31
-; MMR6-NEXT: srlv $7, $7, $8
-; MMR6-NEXT: or $2, $2, $7
-; MMR6-NEXT: or $3, $1, $3
-; MMR6-NEXT: sllv $1, $6, $5
-; MMR6-NEXT: lw16 $6, 12($4)
-; MMR6-NEXT: srl16 $4, $6, 1
-; MMR6-NEXT: srlv $4, $4, $8
+; MMR6-NEXT: sllv $2, $2, $6
+; MMR6-NEXT: srl16 $3, $3, 1
+; MMR6-NEXT: srlv $3, $3, $7
+; MMR6-NEXT: or $2, $2, $3
+; MMR6-NEXT: or $3, $1, $8
+; MMR6-NEXT: sllv $1, $5, $6
+; MMR6-NEXT: lw16 $5, 12($4)
+; MMR6-NEXT: srl16 $4, $5, 1
+; MMR6-NEXT: srlv $4, $4, $7
; MMR6-NEXT: or $4, $1, $4
-; MMR6-NEXT: sllv $5, $6, $5
+; MMR6-NEXT: sllv $5, $5, $6
; MMR6-NEXT: addiu $sp, $sp, 32
; MMR6-NEXT: jrc $ra
entry:
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
index 6349d5c64ab4..29cb34b8d970 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32
; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \
@@ -13,9 +13,9 @@
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6
; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \
@@ -35,6 +35,11 @@
; RUN: FileCheck %s -check-prefix=MMR6
define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
+; MIPS2-LABEL: srem_i1:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $2, $zero, 0
+;
; GP32-LABEL: srem_i1:
; GP32: # %bb.0: # %entry
; GP32-NEXT: jr $ra
@@ -45,6 +50,11 @@ define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $2, $zero, 0
;
+; MIPS3-LABEL: srem_i1:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: addiu $2, $zero, 0
+;
; GP64-LABEL: srem_i1:
; GP64: # %bb.0: # %entry
; GP64-NEXT: jr $ra
@@ -70,6 +80,14 @@ entry:
}
define signext i8 @srem_i8(i8 signext %a, i8 signext %b) {
+; MIPS2-LABEL: srem_i8:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: srem_i8:
; GP32: # %bb.0: # %entry
; GP32-NEXT: div $zero, $4, $5
@@ -83,6 +101,14 @@ define signext i8 @srem_i8(i8 signext %a, i8 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: srem_i8:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: srem_i8:
; GP64: # %bb.0: # %entry
; GP64-NEXT: div $zero, $4, $5
@@ -114,6 +140,14 @@ entry:
}
define signext i16 @srem_i16(i16 signext %a, i16 signext %b) {
+; MIPS2-LABEL: srem_i16:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: srem_i16:
; GP32: # %bb.0: # %entry
; GP32-NEXT: div $zero, $4, $5
@@ -127,6 +161,14 @@ define signext i16 @srem_i16(i16 signext %a, i16 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: srem_i16:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: srem_i16:
; GP64: # %bb.0: # %entry
; GP64-NEXT: div $zero, $4, $5
@@ -158,6 +200,14 @@ entry:
}
define signext i32 @srem_i32(i32 signext %a, i32 signext %b) {
+; MIPS2-LABEL: srem_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: srem_i32:
; GP32: # %bb.0: # %entry
; GP32-NEXT: div $zero, $4, $5
@@ -171,6 +221,14 @@ define signext i32 @srem_i32(i32 signext %a, i32 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: srem_i32:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: div $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: srem_i32:
; GP64: # %bb.0: # %entry
; GP64-NEXT: div $zero, $4, $5
@@ -202,6 +260,22 @@ entry:
}
define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
+; MIPS2-LABEL: srem_i64:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -24
+; MIPS2-NEXT: .cfi_def_cfa_offset 24
+; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $25, %call16(__moddi3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 24
+;
; GP32-LABEL: srem_i64:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -233,6 +307,14 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 24
;
+; MIPS3-LABEL: srem_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddiv $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: srem_i64:
; GP64: # %bb.0: # %entry
; GP64-NEXT: ddiv $zero, $4, $5
@@ -284,6 +366,30 @@ entry:
}
define signext i128 @srem_i128(i128 signext %a, i128 signext %b) {
+; MIPS2-LABEL: srem_i128:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -40
+; MIPS2-NEXT: .cfi_def_cfa_offset 40
+; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $1, 60($sp)
+; MIPS2-NEXT: lw $2, 64($sp)
+; MIPS2-NEXT: lw $3, 68($sp)
+; MIPS2-NEXT: sw $3, 28($sp)
+; MIPS2-NEXT: sw $2, 24($sp)
+; MIPS2-NEXT: sw $1, 20($sp)
+; MIPS2-NEXT: lw $1, 56($sp)
+; MIPS2-NEXT: sw $1, 16($sp)
+; MIPS2-NEXT: lw $25, %call16(__modti3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 40
+;
; GP32-LABEL: srem_i128:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -331,6 +437,25 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 40
;
+; MIPS3-LABEL: srem_i128:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: daddiu $sp, $sp, -16
+; MIPS3-NEXT: .cfi_def_cfa_offset 16
+; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: .cfi_offset 31, -8
+; MIPS3-NEXT: .cfi_offset 28, -16
+; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(srem_i128)))
+; MIPS3-NEXT: daddu $1, $1, $25
+; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(srem_i128)))
+; MIPS3-NEXT: ld $25, %call16(__modti3)($gp)
+; MIPS3-NEXT: jalr $25
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: daddiu $sp, $sp, 16
+;
; GP64-LABEL: srem_i128:
; GP64: # %bb.0: # %entry
; GP64-NEXT: daddiu $sp, $sp, -16
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll
new file mode 100644
index 000000000000..db2c660e9bc7
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32
+
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -O3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -O3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64
+; Two dependent i32 multiplies: MIPS2 checks expect two nops between the first mflo and the second mult, MIPS32 checks expect two mul and no nops.
+define signext i32 @mult_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-LABEL: mult_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: mult $4, $5
+; MIPS2-NEXT: mflo $1
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: mult $1, $6
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
+; MIPS32-LABEL: mult_i32:
+; MIPS32: # %bb.0: # %entry
+; MIPS32-NEXT: mul $1, $4, $5
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: mul $2, $1, $6
+;
+entry:
+ %mul = mul nsw i32 %a, %b
+ %mul1 = mul nsw i32 %mul, %c
+ ret i32 %mul1
+}
+; Two dependent i64 multiplies: MIPS3 checks expect two nops between the first mflo and the second dmult, MIPS64 checks expect no nops.
+define signext i64 @mul_i64(i64 signext %a, i64 signext %b, i64 signext %c) {
+; MIPS3-LABEL: mul_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: dmult $4, $5
+; MIPS3-NEXT: mflo $1
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: dmult $1, $6
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
+; MIPS64-LABEL: mul_i64:
+; MIPS64: # %bb.0: # %entry
+; MIPS64-NEXT: dmult $4, $5
+; MIPS64-NEXT: mflo $1
+; MIPS64-NEXT: dmult $1, $6
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: mflo $2
+;
+entry:
+ %mul = mul i64 %a, %b
+ %mul1 = mul i64 %mul, %c
+ ret i64 %mul1
+}
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll
new file mode 100644
index 000000000000..4ec5ecc9e2f1
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32
+
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64
+
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0
+; Two dependent i32 signed divisions: MIPS2 checks expect two nops between the first mflo and the second div, MIPS32 checks expect no nops.
+define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-LABEL: sdiv_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $1
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: div $zero, $1, $6
+; MIPS2-NEXT: teq $6, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
+; MIPS32-LABEL: sdiv_i32:
+; MIPS32: # %bb.0: # %entry
+; MIPS32-NEXT: div $zero, $4, $5
+; MIPS32-NEXT: teq $5, $zero, 7
+; MIPS32-NEXT: mflo $1
+; MIPS32-NEXT: div $zero, $1, $6
+; MIPS32-NEXT: teq $6, $zero, 7
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: mflo $2
+;
+entry:
+ %sdiv = sdiv i32 %a, %b
+ %sdiv1 = sdiv i32 %sdiv, %c
+ ret i32 %sdiv1
+}
+; Two dependent i64 signed divisions: MIPS3 checks expect two nops between the first mflo and the second ddiv, MIPS64 checks expect no nops.
+define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) {
+; MIPS3-LABEL: sdiv_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddiv $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $1
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ddiv $zero, $1, $6
+; MIPS3-NEXT: teq $6, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
+; MIPS64-LABEL: sdiv_i64:
+; MIPS64: # %bb.0: # %entry
+; MIPS64-NEXT: ddiv $zero, $4, $5
+; MIPS64-NEXT: teq $5, $zero, 7
+; MIPS64-NEXT: mflo $1
+; MIPS64-NEXT: ddiv $zero, $1, $6
+; MIPS64-NEXT: teq $6, $zero, 7
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: mflo $2
+;
+entry:
+ %sdiv = sdiv i64 %a, %b
+ %sdiv1 = sdiv i64 %sdiv, %c
+ ret i64 %sdiv1
+}
+; Dependent i32 signed divisions with the second divisor reloaded from a stack slot (-O0 runs): MIPS2-O0 checks expect the lw plus one nop between the first mflo and the second div, MIPS32-O0 checks expect only the lw.
+define signext i32 @sdiv_lw_sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-O0-LABEL: sdiv_lw_sdiv_i32:
+; MIPS2-O0: # %bb.0: # %entry
+; MIPS2-O0-NEXT: addiu $sp, $sp, -16
+; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS2-O0-NEXT: sw $4, 12($sp)
+; MIPS2-O0-NEXT: sw $5, 8($sp)
+; MIPS2-O0-NEXT: sw $6, 4($sp)
+; MIPS2-O0-NEXT: lw $2, 12($sp)
+; MIPS2-O0-NEXT: lw $1, 8($sp)
+; MIPS2-O0-NEXT: div $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mflo $2
+; MIPS2-O0-NEXT: lw $1, 4($sp)
+; MIPS2-O0-NEXT: nop
+; MIPS2-O0-NEXT: div $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mflo $2
+; MIPS2-O0-NEXT: addiu $sp, $sp, 16
+; MIPS2-O0-NEXT: jr $ra
+; MIPS2-O0-NEXT: nop
+;
+; MIPS32-O0-LABEL: sdiv_lw_sdiv_i32:
+; MIPS32-O0: # %bb.0: # %entry
+; MIPS32-O0-NEXT: addiu $sp, $sp, -16
+; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS32-O0-NEXT: sw $4, 12($sp)
+; MIPS32-O0-NEXT: sw $5, 8($sp)
+; MIPS32-O0-NEXT: sw $6, 4($sp)
+; MIPS32-O0-NEXT: lw $2, 12($sp)
+; MIPS32-O0-NEXT: lw $1, 8($sp)
+; MIPS32-O0-NEXT: div $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mflo $2
+; MIPS32-O0-NEXT: lw $1, 4($sp)
+; MIPS32-O0-NEXT: div $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mflo $2
+; MIPS32-O0-NEXT: addiu $sp, $sp, 16
+; MIPS32-O0-NEXT: jr $ra
+; MIPS32-O0-NEXT: nop
+;
+entry:
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %c.addr = alloca i32, align 4
+ store i32 %a, ptr %a.addr, align 4
+ store i32 %b, ptr %b.addr, align 4
+ store i32 %c, ptr %c.addr, align 4
+ %0 = load i32, ptr %a.addr, align 4
+ %1 = load i32, ptr %b.addr, align 4
+ %sdiv = sdiv i32 %0, %1
+ %2 = load i32, ptr %c.addr, align 4
+ %sdiv1 = sdiv i32 %sdiv, %2
+ ret i32 %sdiv1
+}
+
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll
new file mode 100644
index 000000000000..4f729b015b28
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32
+
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64
+
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0
+; Two dependent i32 signed remainders: MIPS2 checks expect two nops between the first mfhi and the second div, MIPS32 checks expect no nops.
+define signext i32 @srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-LABEL: srem_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: div $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $1
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: div $zero, $1, $6
+; MIPS2-NEXT: teq $6, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
+; MIPS32-LABEL: srem_i32:
+; MIPS32: # %bb.0: # %entry
+; MIPS32-NEXT: div $zero, $4, $5
+; MIPS32-NEXT: teq $5, $zero, 7
+; MIPS32-NEXT: mfhi $1
+; MIPS32-NEXT: div $zero, $1, $6
+; MIPS32-NEXT: teq $6, $zero, 7
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: mfhi $2
+;
+entry:
+ %rem = srem i32 %a, %b
+ %rem1 = srem i32 %rem, %c
+ ret i32 %rem1
+}
+; Two dependent i64 signed remainders: MIPS3 checks expect two nops between the first mfhi and the second ddiv, MIPS64 checks expect no nops.
+define signext i64 @srem_i64(i64 signext %a, i64 signext %b, i64 signext %c) {
+; MIPS3-LABEL: srem_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddiv $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $1
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ddiv $zero, $1, $6
+; MIPS3-NEXT: teq $6, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
+; MIPS64-LABEL: srem_i64:
+; MIPS64: # %bb.0: # %entry
+; MIPS64-NEXT: ddiv $zero, $4, $5
+; MIPS64-NEXT: teq $5, $zero, 7
+; MIPS64-NEXT: mfhi $1
+; MIPS64-NEXT: ddiv $zero, $1, $6
+; MIPS64-NEXT: teq $6, $zero, 7
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: mfhi $2
+;
+entry:
+ %rem = srem i64 %a, %b
+ %rem1 = srem i64 %rem, %c
+ ret i64 %rem1
+}
+; Dependent i32 signed remainders with the second divisor reloaded from a stack slot (-O0 runs): MIPS2-O0 checks expect the lw plus one nop between the first mfhi and the second div, MIPS32-O0 checks expect only the lw.
+define signext i32 @srem_lw_srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-O0-LABEL: srem_lw_srem_i32:
+; MIPS2-O0: # %bb.0: # %entry
+; MIPS2-O0-NEXT: addiu $sp, $sp, -16
+; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS2-O0-NEXT: sw $4, 12($sp)
+; MIPS2-O0-NEXT: sw $5, 8($sp)
+; MIPS2-O0-NEXT: sw $6, 4($sp)
+; MIPS2-O0-NEXT: lw $2, 12($sp)
+; MIPS2-O0-NEXT: lw $1, 8($sp)
+; MIPS2-O0-NEXT: div $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mfhi $2
+; MIPS2-O0-NEXT: lw $1, 4($sp)
+; MIPS2-O0-NEXT: nop
+; MIPS2-O0-NEXT: div $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mfhi $2
+; MIPS2-O0-NEXT: addiu $sp, $sp, 16
+; MIPS2-O0-NEXT: jr $ra
+; MIPS2-O0-NEXT: nop
+;
+; MIPS32-O0-LABEL: srem_lw_srem_i32:
+; MIPS32-O0: # %bb.0: # %entry
+; MIPS32-O0-NEXT: addiu $sp, $sp, -16
+; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS32-O0-NEXT: sw $4, 12($sp)
+; MIPS32-O0-NEXT: sw $5, 8($sp)
+; MIPS32-O0-NEXT: sw $6, 4($sp)
+; MIPS32-O0-NEXT: lw $2, 12($sp)
+; MIPS32-O0-NEXT: lw $1, 8($sp)
+; MIPS32-O0-NEXT: div $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mfhi $2
+; MIPS32-O0-NEXT: lw $1, 4($sp)
+; MIPS32-O0-NEXT: div $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mfhi $2
+; MIPS32-O0-NEXT: addiu $sp, $sp, 16
+; MIPS32-O0-NEXT: jr $ra
+; MIPS32-O0-NEXT: nop
+;
+entry:
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %c.addr = alloca i32, align 4
+ store i32 %a, ptr %a.addr, align 4
+ store i32 %b, ptr %b.addr, align 4
+ store i32 %c, ptr %c.addr, align 4
+ %0 = load i32, ptr %a.addr, align 4
+ %1 = load i32, ptr %b.addr, align 4
+ %rem = srem i32 %0, %1
+ %2 = load i32, ptr %c.addr, align 4
+ %rem1 = srem i32 %rem, %2
+ ret i32 %rem1
+}
+
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll
new file mode 100644
index 000000000000..97ac0d8031cf
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32
+
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64
+
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0
+; Two dependent i32 unsigned divisions: MIPS2 checks expect two nops between the first mflo and the second divu, MIPS32 checks expect no nops.
+define signext i32 @udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-LABEL: udiv_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $1
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: divu $zero, $1, $6
+; MIPS2-NEXT: teq $6, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
+; MIPS32-LABEL: udiv_i32:
+; MIPS32: # %bb.0: # %entry
+; MIPS32-NEXT: divu $zero, $4, $5
+; MIPS32-NEXT: teq $5, $zero, 7
+; MIPS32-NEXT: mflo $1
+; MIPS32-NEXT: divu $zero, $1, $6
+; MIPS32-NEXT: teq $6, $zero, 7
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: mflo $2
+;
+entry:
+ %udiv = udiv i32 %a, %b
+ %udiv1 = udiv i32 %udiv, %c
+ ret i32 %udiv1
+}
+; Two dependent i64 unsigned divisions: MIPS3 checks expect two nops between the first mflo and the second ddivu, MIPS64 checks expect no nops.
+define signext i64 @udiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) {
+; MIPS3-LABEL: udiv_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddivu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $1
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ddivu $zero, $1, $6
+; MIPS3-NEXT: teq $6, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
+; MIPS64-LABEL: udiv_i64:
+; MIPS64: # %bb.0: # %entry
+; MIPS64-NEXT: ddivu $zero, $4, $5
+; MIPS64-NEXT: teq $5, $zero, 7
+; MIPS64-NEXT: mflo $1
+; MIPS64-NEXT: ddivu $zero, $1, $6
+; MIPS64-NEXT: teq $6, $zero, 7
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: mflo $2
+;
+entry:
+ %udiv = udiv i64 %a, %b
+ %udiv1 = udiv i64 %udiv, %c
+ ret i64 %udiv1
+}
+; Dependent i32 unsigned divisions with the second divisor reloaded from a stack slot (-O0 runs): MIPS2-O0 checks expect the lw plus one nop between the first mflo and the second divu, MIPS32-O0 checks expect only the lw.
+define signext i32 @udiv_lw_udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-O0-LABEL: udiv_lw_udiv_i32:
+; MIPS2-O0: # %bb.0: # %entry
+; MIPS2-O0-NEXT: addiu $sp, $sp, -16
+; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS2-O0-NEXT: sw $4, 12($sp)
+; MIPS2-O0-NEXT: sw $5, 8($sp)
+; MIPS2-O0-NEXT: sw $6, 4($sp)
+; MIPS2-O0-NEXT: lw $2, 12($sp)
+; MIPS2-O0-NEXT: lw $1, 8($sp)
+; MIPS2-O0-NEXT: divu $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mflo $2
+; MIPS2-O0-NEXT: lw $1, 4($sp)
+; MIPS2-O0-NEXT: nop
+; MIPS2-O0-NEXT: divu $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mflo $2
+; MIPS2-O0-NEXT: addiu $sp, $sp, 16
+; MIPS2-O0-NEXT: jr $ra
+; MIPS2-O0-NEXT: nop
+;
+; MIPS32-O0-LABEL: udiv_lw_udiv_i32:
+; MIPS32-O0: # %bb.0: # %entry
+; MIPS32-O0-NEXT: addiu $sp, $sp, -16
+; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS32-O0-NEXT: sw $4, 12($sp)
+; MIPS32-O0-NEXT: sw $5, 8($sp)
+; MIPS32-O0-NEXT: sw $6, 4($sp)
+; MIPS32-O0-NEXT: lw $2, 12($sp)
+; MIPS32-O0-NEXT: lw $1, 8($sp)
+; MIPS32-O0-NEXT: divu $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mflo $2
+; MIPS32-O0-NEXT: lw $1, 4($sp)
+; MIPS32-O0-NEXT: divu $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mflo $2
+; MIPS32-O0-NEXT: addiu $sp, $sp, 16
+; MIPS32-O0-NEXT: jr $ra
+; MIPS32-O0-NEXT: nop
+;
+entry:
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %c.addr = alloca i32, align 4
+ store i32 %a, ptr %a.addr, align 4
+ store i32 %b, ptr %b.addr, align 4
+ store i32 %c, ptr %c.addr, align 4
+ %0 = load i32, ptr %a.addr, align 4
+ %1 = load i32, ptr %b.addr, align 4
+ %udiv = udiv i32 %0, %1
+ %2 = load i32, ptr %c.addr, align 4
+ %udiv1 = udiv i32 %udiv, %2
+ ret i32 %udiv1
+}
+
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll
new file mode 100644
index 000000000000..e1819f1d57b7
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32
+
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
+; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64
+
+; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0
+; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0
+; Two dependent i32 unsigned remainders: MIPS2 checks expect two nops between the first mfhi and the second divu, MIPS32 checks expect no nops.
+define signext i32 @urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-LABEL: urem_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $1
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: divu $zero, $1, $6
+; MIPS2-NEXT: teq $6, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
+; MIPS32-LABEL: urem_i32:
+; MIPS32: # %bb.0: # %entry
+; MIPS32-NEXT: divu $zero, $4, $5
+; MIPS32-NEXT: teq $5, $zero, 7
+; MIPS32-NEXT: mfhi $1
+; MIPS32-NEXT: divu $zero, $1, $6
+; MIPS32-NEXT: teq $6, $zero, 7
+; MIPS32-NEXT: jr $ra
+; MIPS32-NEXT: mfhi $2
+;
+entry:
+ %urem = urem i32 %a, %b
+ %urem1 = urem i32 %urem, %c
+ ret i32 %urem1
+}
+; Two dependent i64 unsigned remainders: MIPS3 checks expect two nops between the first mfhi and the second ddivu, MIPS64 checks expect no nops.
+define signext i64 @urem_i64(i64 signext %a, i64 signext %b, i64 signext %c) {
+; MIPS3-LABEL: urem_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddivu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $1
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ddivu $zero, $1, $6
+; MIPS3-NEXT: teq $6, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
+; MIPS64-LABEL: urem_i64:
+; MIPS64: # %bb.0: # %entry
+; MIPS64-NEXT: ddivu $zero, $4, $5
+; MIPS64-NEXT: teq $5, $zero, 7
+; MIPS64-NEXT: mfhi $1
+; MIPS64-NEXT: ddivu $zero, $1, $6
+; MIPS64-NEXT: teq $6, $zero, 7
+; MIPS64-NEXT: jr $ra
+; MIPS64-NEXT: mfhi $2
+;
+entry:
+ %urem = urem i64 %a, %b
+ %urem1 = urem i64 %urem, %c
+ ret i64 %urem1
+}
+; Dependent i32 unsigned remainders with the second divisor reloaded from a stack slot (-O0 runs): MIPS2-O0 checks expect the lw plus one nop between the first mfhi and the second divu, MIPS32-O0 checks expect only the lw.
+define signext i32 @urem_lw_urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) {
+; MIPS2-O0-LABEL: urem_lw_urem_i32:
+; MIPS2-O0: # %bb.0: # %entry
+; MIPS2-O0-NEXT: addiu $sp, $sp, -16
+; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS2-O0-NEXT: sw $4, 12($sp)
+; MIPS2-O0-NEXT: sw $5, 8($sp)
+; MIPS2-O0-NEXT: sw $6, 4($sp)
+; MIPS2-O0-NEXT: lw $2, 12($sp)
+; MIPS2-O0-NEXT: lw $1, 8($sp)
+; MIPS2-O0-NEXT: divu $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mfhi $2
+; MIPS2-O0-NEXT: lw $1, 4($sp)
+; MIPS2-O0-NEXT: nop
+; MIPS2-O0-NEXT: divu $zero, $2, $1
+; MIPS2-O0-NEXT: teq $1, $zero, 7
+; MIPS2-O0-NEXT: mfhi $2
+; MIPS2-O0-NEXT: addiu $sp, $sp, 16
+; MIPS2-O0-NEXT: jr $ra
+; MIPS2-O0-NEXT: nop
+;
+; MIPS32-O0-LABEL: urem_lw_urem_i32:
+; MIPS32-O0: # %bb.0: # %entry
+; MIPS32-O0-NEXT: addiu $sp, $sp, -16
+; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16
+; MIPS32-O0-NEXT: sw $4, 12($sp)
+; MIPS32-O0-NEXT: sw $5, 8($sp)
+; MIPS32-O0-NEXT: sw $6, 4($sp)
+; MIPS32-O0-NEXT: lw $2, 12($sp)
+; MIPS32-O0-NEXT: lw $1, 8($sp)
+; MIPS32-O0-NEXT: divu $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mfhi $2
+; MIPS32-O0-NEXT: lw $1, 4($sp)
+; MIPS32-O0-NEXT: divu $zero, $2, $1
+; MIPS32-O0-NEXT: teq $1, $zero, 7
+; MIPS32-O0-NEXT: mfhi $2
+; MIPS32-O0-NEXT: addiu $sp, $sp, 16
+; MIPS32-O0-NEXT: jr $ra
+; MIPS32-O0-NEXT: nop
+;
+entry:
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %c.addr = alloca i32, align 4
+ store i32 %a, ptr %a.addr, align 4
+ store i32 %b, ptr %b.addr, align 4
+ store i32 %c, ptr %c.addr, align 4
+ %0 = load i32, ptr %a.addr, align 4
+ %1 = load i32, ptr %b.addr, align 4
+ %rem = urem i32 %0, %1
+ %2 = load i32, ptr %c.addr, align 4
+ %urem1 = urem i32 %rem, %2
+ ret i32 %urem1
+}
+
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
index e3dd347e723b..cc2c6614e69c 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32
; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \
@@ -13,9 +13,9 @@
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6
; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \
@@ -35,6 +35,11 @@
; RUN: FileCheck %s -check-prefix=MMR6
define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
+; MIPS2-LABEL: udiv_i1:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: move $2, $4
+;
; GP32-LABEL: udiv_i1:
; GP32: # %bb.0: # %entry
; GP32-NEXT: jr $ra
@@ -45,6 +50,11 @@ define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: move $2, $4
;
+; MIPS3-LABEL: udiv_i1:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: move $2, $4
+;
; GP64-LABEL: udiv_i1:
; GP64: # %bb.0: # %entry
; GP64-NEXT: jr $ra
@@ -70,6 +80,14 @@ entry:
}
define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) {
+; MIPS2-LABEL: udiv_i8:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: udiv_i8:
; GP32: # %bb.0: # %entry
; GP32-NEXT: divu $zero, $4, $5
@@ -83,6 +101,14 @@ define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: udiv_i8:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: divu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: udiv_i8:
; GP64: # %bb.0: # %entry
; GP64-NEXT: divu $zero, $4, $5
@@ -114,6 +140,14 @@ entry:
}
define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) {
+; MIPS2-LABEL: udiv_i16:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: udiv_i16:
; GP32: # %bb.0: # %entry
; GP32-NEXT: divu $zero, $4, $5
@@ -127,6 +161,14 @@ define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: udiv_i16:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: divu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: udiv_i16:
; GP64: # %bb.0: # %entry
; GP64-NEXT: divu $zero, $4, $5
@@ -158,6 +200,14 @@ entry:
}
define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) {
+; MIPS2-LABEL: udiv_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mflo $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: udiv_i32:
; GP32: # %bb.0: # %entry
; GP32-NEXT: divu $zero, $4, $5
@@ -171,6 +221,14 @@ define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: udiv_i32:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: divu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: udiv_i32:
; GP64: # %bb.0: # %entry
; GP64-NEXT: divu $zero, $4, $5
@@ -202,6 +260,22 @@ entry:
}
define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
+; MIPS2-LABEL: udiv_i64:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -24
+; MIPS2-NEXT: .cfi_def_cfa_offset 24
+; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $25, %call16(__udivdi3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 24
+;
; GP32-LABEL: udiv_i64:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -233,6 +307,14 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 24
;
+; MIPS3-LABEL: udiv_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddivu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mflo $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: udiv_i64:
; GP64: # %bb.0: # %entry
; GP64-NEXT: ddivu $zero, $4, $5
@@ -284,6 +366,30 @@ entry:
}
define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) {
+; MIPS2-LABEL: udiv_i128:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -40
+; MIPS2-NEXT: .cfi_def_cfa_offset 40
+; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $1, 60($sp)
+; MIPS2-NEXT: lw $2, 64($sp)
+; MIPS2-NEXT: lw $3, 68($sp)
+; MIPS2-NEXT: sw $3, 28($sp)
+; MIPS2-NEXT: sw $2, 24($sp)
+; MIPS2-NEXT: sw $1, 20($sp)
+; MIPS2-NEXT: lw $1, 56($sp)
+; MIPS2-NEXT: sw $1, 16($sp)
+; MIPS2-NEXT: lw $25, %call16(__udivti3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 40
+;
; GP32-LABEL: udiv_i128:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -331,6 +437,25 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 40
;
+; MIPS3-LABEL: udiv_i128:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: daddiu $sp, $sp, -16
+; MIPS3-NEXT: .cfi_def_cfa_offset 16
+; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: .cfi_offset 31, -8
+; MIPS3-NEXT: .cfi_offset 28, -16
+; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(udiv_i128)))
+; MIPS3-NEXT: daddu $1, $1, $25
+; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(udiv_i128)))
+; MIPS3-NEXT: ld $25, %call16(__udivti3)($gp)
+; MIPS3-NEXT: jalr $25
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: daddiu $sp, $sp, 16
+;
; GP64-LABEL: udiv_i128:
; GP64: # %bb.0: # %entry
; GP64-NEXT: daddiu $sp, $sp, -16
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll
index 4105d67da6f1..5da1f614b8f1 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2
; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2
; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \
@@ -13,9 +13,9 @@
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6
; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \
-; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \
; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1
; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \
@@ -35,6 +35,11 @@
; RUN: FileCheck %s -check-prefix=MMR6
define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
+; MIPS2-LABEL: urem_i1:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $2, $zero, 0
+;
; GP32-LABEL: urem_i1:
; GP32: # %bb.0: # %entry
; GP32-NEXT: jr $ra
@@ -45,6 +50,11 @@ define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $2, $zero, 0
;
+; MIPS3-LABEL: urem_i1:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: addiu $2, $zero, 0
+;
; GP64-LABEL: urem_i1:
; GP64: # %bb.0: # %entry
; GP64-NEXT: jr $ra
@@ -70,6 +80,17 @@ entry:
}
define signext i8 @urem_i8(i8 signext %a, i8 signext %b) {
+; MIPS2-LABEL: urem_i8:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: andi $1, $5, 255
+; MIPS2-NEXT: andi $2, $4, 255
+; MIPS2-NEXT: divu $zero, $2, $1
+; MIPS2-NEXT: teq $1, $zero, 7
+; MIPS2-NEXT: mfhi $1
+; MIPS2-NEXT: sll $1, $1, 24
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: sra $2, $1, 24
+;
; GP32R0R2-LABEL: urem_i8:
; GP32R0R2: # %bb.0: # %entry
; GP32R0R2-NEXT: andi $1, $5, 255
@@ -100,6 +121,17 @@ define signext i8 @urem_i8(i8 signext %a, i8 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: seb $2, $2
;
+; MIPS3-LABEL: urem_i8:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: andi $1, $5, 255
+; MIPS3-NEXT: andi $2, $4, 255
+; MIPS3-NEXT: divu $zero, $2, $1
+; MIPS3-NEXT: teq $1, $zero, 7
+; MIPS3-NEXT: mfhi $1
+; MIPS3-NEXT: sll $1, $1, 24
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: sra $2, $1, 24
+;
; GP64R0R1-LABEL: urem_i8:
; GP64R0R1: # %bb.0: # %entry
; GP64R0R1-NEXT: andi $1, $5, 255
@@ -154,6 +186,17 @@ entry:
}
define signext i16 @urem_i16(i16 signext %a, i16 signext %b) {
+; MIPS2-LABEL: urem_i16:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: andi $1, $5, 65535
+; MIPS2-NEXT: andi $2, $4, 65535
+; MIPS2-NEXT: divu $zero, $2, $1
+; MIPS2-NEXT: teq $1, $zero, 7
+; MIPS2-NEXT: mfhi $1
+; MIPS2-NEXT: sll $1, $1, 16
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: sra $2, $1, 16
+;
; GP32R0R2-LABEL: urem_i16:
; GP32R0R2: # %bb.0: # %entry
; GP32R0R2-NEXT: andi $1, $5, 65535
@@ -184,6 +227,17 @@ define signext i16 @urem_i16(i16 signext %a, i16 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: seh $2, $2
;
+; MIPS3-LABEL: urem_i16:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: andi $1, $5, 65535
+; MIPS3-NEXT: andi $2, $4, 65535
+; MIPS3-NEXT: divu $zero, $2, $1
+; MIPS3-NEXT: teq $1, $zero, 7
+; MIPS3-NEXT: mfhi $1
+; MIPS3-NEXT: sll $1, $1, 16
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: sra $2, $1, 16
+;
; GP64R0R1-LABEL: urem_i16:
; GP64R0R1: # %bb.0: # %entry
; GP64R0R1-NEXT: andi $1, $5, 65535
@@ -238,6 +292,14 @@ entry:
}
define signext i32 @urem_i32(i32 signext %a, i32 signext %b) {
+; MIPS2-LABEL: urem_i32:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: divu $zero, $4, $5
+; MIPS2-NEXT: teq $5, $zero, 7
+; MIPS2-NEXT: mfhi $2
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: nop
+;
; GP32-LABEL: urem_i32:
; GP32: # %bb.0: # %entry
; GP32-NEXT: divu $zero, $4, $5
@@ -251,6 +313,14 @@ define signext i32 @urem_i32(i32 signext %a, i32 signext %b) {
; GP32R6-NEXT: teq $5, $zero, 7
; GP32R6-NEXT: jrc $ra
;
+; MIPS3-LABEL: urem_i32:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: divu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: urem_i32:
; GP64: # %bb.0: # %entry
; GP64-NEXT: divu $zero, $4, $5
@@ -282,6 +352,22 @@ entry:
}
define signext i64 @urem_i64(i64 signext %a, i64 signext %b) {
+; MIPS2-LABEL: urem_i64:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -24
+; MIPS2-NEXT: .cfi_def_cfa_offset 24
+; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $25, %call16(__umoddi3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 24
+;
; GP32-LABEL: urem_i64:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -313,6 +399,14 @@ define signext i64 @urem_i64(i64 signext %a, i64 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 24
;
+; MIPS3-LABEL: urem_i64:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: ddivu $zero, $4, $5
+; MIPS3-NEXT: teq $5, $zero, 7
+; MIPS3-NEXT: mfhi $2
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: nop
+;
; GP64-LABEL: urem_i64:
; GP64: # %bb.0: # %entry
; GP64-NEXT: ddivu $zero, $4, $5
@@ -364,6 +458,30 @@ entry:
}
define signext i128 @urem_i128(i128 signext %a, i128 signext %b) {
+; MIPS2-LABEL: urem_i128:
+; MIPS2: # %bb.0: # %entry
+; MIPS2-NEXT: lui $2, %hi(_gp_disp)
+; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp)
+; MIPS2-NEXT: addiu $sp, $sp, -40
+; MIPS2-NEXT: .cfi_def_cfa_offset 40
+; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS2-NEXT: .cfi_offset 31, -4
+; MIPS2-NEXT: addu $gp, $2, $25
+; MIPS2-NEXT: lw $1, 60($sp)
+; MIPS2-NEXT: lw $2, 64($sp)
+; MIPS2-NEXT: lw $3, 68($sp)
+; MIPS2-NEXT: sw $3, 28($sp)
+; MIPS2-NEXT: sw $2, 24($sp)
+; MIPS2-NEXT: sw $1, 20($sp)
+; MIPS2-NEXT: lw $1, 56($sp)
+; MIPS2-NEXT: sw $1, 16($sp)
+; MIPS2-NEXT: lw $25, %call16(__umodti3)($gp)
+; MIPS2-NEXT: jalr $25
+; MIPS2-NEXT: nop
+; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS2-NEXT: jr $ra
+; MIPS2-NEXT: addiu $sp, $sp, 40
+;
; GP32-LABEL: urem_i128:
; GP32: # %bb.0: # %entry
; GP32-NEXT: lui $2, %hi(_gp_disp)
@@ -411,6 +529,25 @@ define signext i128 @urem_i128(i128 signext %a, i128 signext %b) {
; GP32R6-NEXT: jr $ra
; GP32R6-NEXT: addiu $sp, $sp, 40
;
+; MIPS3-LABEL: urem_i128:
+; MIPS3: # %bb.0: # %entry
+; MIPS3-NEXT: daddiu $sp, $sp, -16
+; MIPS3-NEXT: .cfi_def_cfa_offset 16
+; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS3-NEXT: .cfi_offset 31, -8
+; MIPS3-NEXT: .cfi_offset 28, -16
+; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(urem_i128)))
+; MIPS3-NEXT: daddu $1, $1, $25
+; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(urem_i128)))
+; MIPS3-NEXT: ld $25, %call16(__umodti3)($gp)
+; MIPS3-NEXT: jalr $25
+; MIPS3-NEXT: nop
+; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS3-NEXT: jr $ra
+; MIPS3-NEXT: daddiu $sp, $sp, 16
+;
; GP64-LABEL: urem_i128:
; GP64: # %bb.0: # %entry
; GP64-NEXT: daddiu $sp, $sp, -16
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
new file mode 100644
index 000000000000..82eb5fb71677
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: fence_sc_cluster
+define void @fence_sc_cluster() local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ fence syncscope("cluster") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cluster
+define void @fence_acq_rel_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cluster
+define void @fence_release_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cluster
+define void @fence_acquire_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acquire
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
index d3aace95e966..626685f82f32 100644
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+; TODO: implement and test thread scope.
+
; CHECK-LABEL: fence_sc_sys
define void @fence_sc_sys() local_unnamed_addr {
; SM60: membar.sys
@@ -16,21 +18,85 @@ define void @fence_acq_rel_sys() local_unnamed_addr {
; SM60: membar.sys
; SM70: fence.acq_rel.sys
fence acq_rel
- ret void
+ ret void
}
; CHECK-LABEL: fence_release_sys
define void @fence_release_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence release
- ret void
+ ret void
}
; CHECK-LABEL: fence_acquire_sys
define void @fence_acquire_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence acquire
- ret void
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_gpu
+define void @fence_sc_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.sc.gpu
+ fence syncscope("device") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_gpu
+define void @fence_acq_rel_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_gpu
+define void @fence_release_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_gpu
+define void @fence_acquire_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acquire
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_cta
+define void @fence_sc_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.sc.cta
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cta
+define void @fence_acq_rel_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cta
+define void @fence_release_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cta
+define void @fence_acquire_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acquire
+ ret void
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 9cea33d12027..4b200eacb0cf 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1,10 +1,367 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.mmio.relaxed.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add test for vectors that exceed 128-bit length
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
+
;; generic statespace
-; CHECK-LABEL: generic_acq_rel
-define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_gpu
+define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_gpu
+define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_cta
+define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_cta
+define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_gpu
+define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_gpu
+define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_cta
+define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_cta
+define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_sys
+define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -31,7 +388,7 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e release, align 4
@@ -44,8 +401,8 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
ret void
}
-; CHECK-LABEL: generic_acq_rel_volatile
-define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_volatile_sys
+define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -72,7 +429,7 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e release, align 4
@@ -85,8 +442,172 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
ret void
}
-; CHECK-LABEL: generic_sc
-define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_gpu
+define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; plain ptr arguments, syncscope("device"): acquire loads / release stores are expected with the .gpu scope qualifier
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_gpu
+define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; volatile variant: despite syncscope("device"), the expected PTX below is .sys-scoped
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_cta
+define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; plain ptr arguments, syncscope("block"): acquire loads / release stores are expected with the .cta scope qualifier
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_cta
+define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; volatile variant: despite syncscope("block"), the expected PTX below is .sys-scoped
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_sys
+define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a seq_cst, align 1
@@ -122,7 +643,7 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e seq_cst, align 4
@@ -138,8 +659,8 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
ret void
}
-; CHECK-LABEL: generic_sc_volatile
-define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_sc_volatile_sys
+define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a seq_cst, align 1
@@ -175,7 +696,7 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e seq_cst, align 4
@@ -191,10 +712,550 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
ret void
}
+; CHECK-LABEL: generic_sc_gpu
+define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; seq_cst at syncscope("device"): each access is expected as fence.sc.gpu followed by ld.acquire.gpu / st.release.gpu
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_volatile_gpu
+define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; volatile seq_cst: despite syncscope("device"), the fence and the access are both expected at .sys scope
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_cta
+define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; seq_cst at syncscope("block"): each access is expected as fence.sc.cta followed by ld.acquire.cta / st.release.cta
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_volatile_cta
+define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; volatile seq_cst: despite syncscope("block"), the fence and the access are both expected at .sys scope
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
;; global statespace
-; CHECK-LABEL: global_acq_rel
-define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_gpu
+define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; addrspace(1), unordered at syncscope("device"): expected as ld/st.relaxed.gpu.global
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile_gpu
+define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; volatile variant: expected as ld/st.mmio.relaxed.sys.global despite syncscope("device")
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_cta
+define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; addrspace(1), unordered at syncscope("block"): expected as ld/st.relaxed.cta.global
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile_cta
+define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; volatile variant: expected as ld/st.mmio.relaxed.sys.global despite syncscope("block")
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_gpu
+define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; addrspace(1), monotonic at syncscope("device"): expected as ld/st.relaxed.gpu.global
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 ; NOTE(review): the f64 case reuses %e (no sixth pointer parameter) - presumably intentional, confirm
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; Volatile monotonic atomic loads/stores through addrspace(1) pointers with
+; syncscope("device"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; The expected PTX is ld/st.mmio.relaxed.sys.global, i.e. the patterns expect
+; the .sys scope even though the IR requests the "device" scope.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_monotonic_volatile_gpu
+define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; Non-volatile monotonic atomic loads/stores through addrspace(1) pointers
+; with syncscope("block"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; The expected PTX is ld/st.relaxed.cta.global for every width.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_monotonic_cta
+define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; Volatile monotonic atomic loads/stores through addrspace(1) pointers with
+; syncscope("block"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; The expected PTX is ld/st.mmio.relaxed.sys.global, i.e. the patterns expect
+; the .sys scope even though the IR requests the "block" scope.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_monotonic_volatile_cta
+define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_sys
+define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -221,7 +1282,7 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp
; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(1) %e release, align 4
@@ -234,8 +1295,8 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: global_acq_rel_volatile
-define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_acq_rel_volatile_sys
+define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -262,7 +1323,7 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p
; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
@@ -275,8 +1336,172 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p
ret void
}
-; CHECK-LABEL: global_seq_cst
-define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; Non-volatile acquire loads paired with release stores through addrspace(1)
+; pointers with syncscope("device"), covering i8, i16, i32, i64, float and
+; double. Each loaded value is incremented by one and stored back to the same
+; pointer. The expected PTX is ld.acquire.gpu.global / st.release.gpu.global.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_acq_rel_gpu
+define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; Volatile acquire loads paired with volatile release stores through
+; addrspace(1) pointers with syncscope("device"), covering i8, i16, i32, i64,
+; float and double. Each loaded value is incremented by one and stored back to
+; the same pointer. The expected PTX is ld.acquire.sys.global /
+; st.release.sys.global, i.e. the patterns expect the .sys scope even though
+; the IR requests the "device" scope.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_acq_rel_volatile_gpu
+define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; Non-volatile acquire loads paired with release stores through addrspace(1)
+; pointers with syncscope("block"), covering i8, i16, i32, i64, float and
+; double. Each loaded value is incremented by one and stored back to the same
+; pointer. The expected PTX is ld.acquire.cta.global / st.release.cta.global.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_acq_rel_cta
+define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; Volatile acquire loads paired with volatile release stores through
+; addrspace(1) pointers with syncscope("block"), covering i8, i16, i32, i64,
+; float and double. Each loaded value is incremented by one and stored back to
+; the same pointer. The expected PTX is ld.acquire.sys.global /
+; st.release.sys.global, i.e. the patterns expect the .sys scope even though
+; the IR requests the "block" scope.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_acq_rel_volatile_cta
+define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_sys
+define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
@@ -312,7 +1537,7 @@ define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
@@ -328,8 +1553,8 @@ define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: global_seq_cst_volatile
-define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_seq_cst_volatile_sys
+define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
@@ -365,7 +1590,7 @@ define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
@@ -381,10 +1606,550 @@ define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p
ret void
}
+; Non-volatile seq_cst atomic loads/stores through addrspace(1) pointers with
+; syncscope("device"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; For every access the expected PTX is a fence.sc.gpu followed by
+; ld.acquire.gpu.global (loads) or st.release.gpu.global (stores).
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_seq_cst_gpu
+define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile seq_cst atomic loads/stores through addrspace(1) pointers with
+; syncscope("device"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; For every access the expected PTX is a fence.sc.sys followed by
+; ld.acquire.sys.global (loads) or st.release.sys.global (stores), i.e. the
+; patterns expect the .sys scope even though the IR requests "device".
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_seq_cst_volatile_gpu
+define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; Non-volatile seq_cst atomic loads/stores through addrspace(1) pointers with
+; syncscope("block"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; For every access the expected PTX is a fence.sc.cta followed by
+; ld.acquire.cta.global (loads) or st.release.cta.global (stores).
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_seq_cst_cta
+define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile seq_cst atomic loads/stores through addrspace(1) pointers with
+; syncscope("block"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; For every access the expected PTX is a fence.sc.sys followed by
+; ld.acquire.sys.global (loads) or st.release.sys.global (stores), i.e. the
+; patterns expect the .sys scope even though the IR requests "block".
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: global_seq_cst_volatile_cta
+define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
;; shared statespace
-; CHECK-LABEL: shared_acq_rel
-define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; Non-volatile unordered atomic loads/stores through addrspace(3) pointers
+; with syncscope("device"), covering i8, i16, i32, i64, float and double. Each
+; loaded value is incremented by one and stored back to the same pointer.
+; The expected PTX is ld/st.relaxed.gpu.shared for every width.
+; Note: the double accesses reuse %e; there is no sixth pointer parameter.
+; CHECK-LABEL: shared_unordered_gpu
+define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_gpu
+define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile unordered atomics at syncscope("device") on addrspace(3): load, add 1, store back per type; expected PTX is ld/st.volatile.shared with no ordering or scope qualifier.
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_cta
+define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Non-volatile unordered atomics at syncscope("block") on addrspace(3): load, add 1, store back for i8/i16/i32/i64/float/double; expected PTX is ld/st.relaxed.cta.shared.
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_cta
+define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile unordered atomics at syncscope("block") on addrspace(3): load, add 1, store back per type; expected PTX is ld/st.volatile.shared with no ordering or scope qualifier.
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_gpu
+define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Non-volatile monotonic atomics at syncscope("device") on addrspace(3): load, add 1, store back for i8/i16/i32/i64/float/double; expected PTX is ld/st.relaxed.gpu.shared.
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_gpu
+define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile monotonic atomics at syncscope("device") on addrspace(3): load, add 1, store back per type; expected PTX is ld/st.volatile.shared with no ordering or scope qualifier.
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_cta
+define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Non-volatile monotonic atomics at syncscope("block") on addrspace(3): load, add 1, store back for i8/i16/i32/i64/float/double; expected PTX is ld/st.relaxed.cta.shared.
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_cta
+define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile monotonic atomics at syncscope("block") on addrspace(3): load, add 1, store back per type; expected PTX is ld/st.volatile.shared with no ordering or scope qualifier.
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_sys
+define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -411,7 +2176,7 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp
; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(3) %e release, align 4
@@ -424,8 +2189,8 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: shared_acq_rel_volatile
-define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_acq_rel_volatile_sys
+define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -452,7 +2217,7 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p
; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
@@ -465,8 +2230,172 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p
ret void
}
-; CHECK-LABEL: shared_seq_cst
-define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_acq_rel_gpu
+define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Non-volatile acquire loads / release stores at syncscope("device") on addrspace(3): load, add 1, store back per type; expected PTX is ld.acquire.gpu.shared / st.release.gpu.shared.
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_gpu
+define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile acquire loads / release stores at syncscope("device") on addrspace(3): load, add 1, store back per type; the expected PTX uses .sys scope (ld.acquire.sys.shared / st.release.sys.shared), not .gpu.
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_cta
+define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Non-volatile acquire loads / release stores at syncscope("block") on addrspace(3): load, add 1, store back per type; expected PTX is ld.acquire.cta.shared / st.release.cta.shared.
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_cta
+define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; Volatile acquire loads / release stores at syncscope("block") on addrspace(3): load, add 1, store back per type; the expected PTX uses .sys scope (ld.acquire.sys.shared / st.release.sys.shared), not .cta.
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+ ; NOTE(review): the double load/store below reuse %e (the signature has no sixth pointer), so they go through the same pointer as the float accesses above - confirm this is intended.
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_sys
+define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
@@ -502,7 +2431,7 @@ define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
@@ -510,16 +2439,16 @@ define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
%f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
ret void
}
-; CHECK-LABEL: shared_seq_cst_volatile
-define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_seq_cst_volatile_sys
+define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
@@ -555,7 +2484,7 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
@@ -571,13 +2500,550 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p
ret void
}
+; CHECK-LABEL: shared_seq_cst_gpu
+define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; seq_cst atomic load/store pairs on addrspace(3) at syncscope("device"), for i8/i16/i32/i64/float/double; every access expects fence.sc.gpu and then a .gpu-scoped ld.acquire / st.release on .shared, in that order
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_gpu
+define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; volatile seq_cst atomic load/store pairs on addrspace(3) at syncscope("device"); although the IR names the device scope, every expected PTX line below uses .sys (fence.sc.sys, ld.acquire.sys.shared, st.release.sys.shared)
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_cta
+define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; seq_cst atomic load/store pairs on addrspace(3) at syncscope("block"), for i8/i16/i32/i64/float/double; every access expects fence.sc.cta and then a .cta-scoped ld.acquire / st.release on .shared, in that order
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_cta
+define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; volatile seq_cst atomic load/store pairs on addrspace(3) at syncscope("block"); although the IR names the block scope, every expected PTX line below uses .sys (fence.sc.sys, ld.acquire.sys.shared, st.release.sys.shared)
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
;; local statespace
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
+; CHECK-LABEL: local_unordered_gpu
+define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; unordered atomic load/store pairs on addrspace(5) at syncscope("device"), for i8/i16/i32/i64/float/double; the check patterns below name plain ld.local / st.local, with no ordering or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_gpu
+define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; volatile unordered atomic load/store pairs on addrspace(5) at syncscope("device"); the check patterns below still name plain ld.local / st.local, with no .volatile, ordering, or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_cta
+define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; unordered atomic load/store pairs on addrspace(5) at syncscope("block"), for i8/i16/i32/i64/float/double; the check patterns below name plain ld.local / st.local, with no ordering or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_cta
+define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; volatile unordered atomic load/store pairs on addrspace(5) at syncscope("block"); the check patterns below still name plain ld.local / st.local, with no .volatile, ordering, or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_gpu
+define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; monotonic atomic load/store pairs on addrspace(5) at syncscope("device"), for i8/i16/i32/i64/float/double; the check patterns below name plain ld.local / st.local, with no ordering or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_gpu
+define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; volatile monotonic atomic load/store pairs on addrspace(5) at syncscope("device"); the check patterns below still name plain ld.local / st.local, with no .volatile, ordering, or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_cta
+define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; monotonic atomic load/store pairs on addrspace(5) at syncscope("block"), for i8/i16/i32/i64/float/double; the check patterns below name plain ld.local / st.local, with no ordering or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_cta
+define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; volatile monotonic atomic load/store pairs on addrspace(5) at syncscope("block"); the check patterns below still name plain ld.local / st.local, with no .volatile, ordering, or scope qualifier between the opcode and .local
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1 ; the loaded value is incremented and written back through the same pointer
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 ; the double access goes through %e as well; the function has no separate pointer argument for it
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_sys
+define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -604,7 +3070,7 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(5) %e release, align 4
@@ -617,11 +3083,8 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
}
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_volatile_sys
+define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -648,7 +3111,7 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
@@ -661,11 +3124,172 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
}
-; CHECK-LABEL: local_seq_cst
-define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
+; CHECK-LABEL: local_acq_rel_gpu
+define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+ ; Every access in this function is an atomic acquire load or release store at syncscope("device") on an addrspace(5) pointer; the expected PTX throughout is plain ld.local/st.local.
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+; Volatile atomic acquire loads / release stores at syncscope("device") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_acq_rel_volatile_gpu
+define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+; Atomic acquire loads / release stores at syncscope("block") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_acq_rel_cta
+define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+; Volatile atomic acquire loads / release stores at syncscope("block") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_acq_rel_volatile_cta
+define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_sys
+define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -692,7 +3316,7 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
@@ -705,11 +3329,8 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
}
-; CHECK-LABEL: local_seq_cst_volatile
-define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_seq_cst_volatile_sys
+define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -736,7 +3357,7 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
@@ -746,10 +3367,169 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
- ; TODO: LLVM IR Verifier does not support atomics on vector types.
+ ret void
+}
+; Atomic seq_cst loads and stores at syncscope("device") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_seq_cst_gpu
+define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
ret void
}
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces \ No newline at end of file
+; CHECK-LABEL: local_seq_cst_volatile_gpu
+define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ ; Every access in this function is a volatile atomic seq_cst load or store at syncscope("device") on an addrspace(5) pointer; the expected PTX throughout is plain ld.local/st.local.
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+; Atomic seq_cst loads and stores at syncscope("block") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_seq_cst_cta
+define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+; Volatile atomic seq_cst loads and stores at syncscope("block") on addrspace(5) pointers; the expected PTX is plain ld.local/st.local.
+; CHECK-LABEL: local_seq_cst_volatile_cta
+define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
new file mode 100644
index 000000000000..645170da51a0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -0,0 +1,1423 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.mmio.relaxed.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add test for vectors that exceed 128-bit length
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for the .const and .param statespaces.
+
+; TODO: optimize .shared.sys into .shared.cta or .shared.cluster .
+
+;; generic statespace
+; Atomic unordered loads and stores at syncscope("cluster") on generic pointers; the expected PTX is ld.relaxed.cluster/st.relaxed.cluster.
+; CHECK-LABEL: generic_unordered_cluster
+define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+; Volatile atomic unordered loads and stores at syncscope("cluster") on generic pointers; the expected PTX is ld.volatile/st.volatile with no scope qualifier.
+; CHECK-LABEL: generic_unordered_volatile_cluster
+define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+; Atomic monotonic loads and stores at syncscope("cluster") on generic pointers; the expected PTX is ld.relaxed.cluster/st.relaxed.cluster.
+; CHECK-LABEL: generic_monotonic_cluster
+define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+; Volatile atomic monotonic loads and stores at syncscope("cluster") on generic pointers; the expected PTX is ld.volatile/st.volatile with no scope qualifier.
+; CHECK-LABEL: generic_monotonic_volatile_cluster
+define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+; Atomic acquire loads / release stores at syncscope("cluster") on generic pointers; the expected PTX is ld.acquire.cluster/st.release.cluster.
+; CHECK-LABEL: generic_acq_rel_cluster
+define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") release, align 4
+ ; NOTE(review): the f64 pair below reuses %e (the signature has no sixth pointer), so it aliases the f32 pair - confirm this is intended.
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Volatile acquire loads and release stores through plain `ptr` (generic)
+; arguments. The IR requests syncscope("cluster"), but the expected PTX uses
+; .sys scope (ld.acquire.sys / st.release.sys) for every type.
+; CHECK-LABEL: generic_acq_rel_volatile_cluster
+define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Cluster-scope seq_cst loads and stores through plain `ptr` (generic)
+; arguments. Every access is expected to appear after a fence.sc.cluster and to
+; be emitted as ld.acquire.cluster (loads) or st.release.cluster (stores).
+; CHECK-LABEL: generic_sc_cluster
+define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile seq_cst loads and stores through plain `ptr` (generic) arguments.
+; The IR requests syncscope("cluster"), but the expected PTX uses .sys scope:
+; fence.sc.sys followed by ld.acquire.sys (loads) or st.release.sys (stores).
+; CHECK-LABEL: generic_sc_volatile_cluster
+define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; global statespace
+
+; Cluster-scope unordered atomic loads and stores on addrspace(1) pointers.
+; Each group loads a value, adds 1 and stores it back; the expected PTX is
+; ld.relaxed.cluster.global / st.relaxed.cluster.global for every type.
+; CHECK-LABEL: global_unordered_cluster
+define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Volatile unordered atomic loads and stores on addrspace(1) pointers with
+; syncscope("cluster"). The expected PTX is ld.volatile.global /
+; st.volatile.global, with no ordering or scope qualifier.
+; CHECK-LABEL: global_unordered_volatile_cluster
+define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Cluster-scope monotonic atomic loads and stores on addrspace(1) pointers.
+; Each group loads a value, adds 1 and stores it back; the expected PTX is
+; ld.relaxed.cluster.global / st.relaxed.cluster.global for every type.
+; CHECK-LABEL: global_monotonic_cluster
+define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Volatile monotonic atomic loads and stores on addrspace(1) pointers with
+; syncscope("cluster"). The expected PTX is ld.volatile.global /
+; st.volatile.global, with no ordering or scope qualifier.
+; CHECK-LABEL: global_monotonic_volatile_cluster
+define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Cluster-scope acquire loads and release stores on addrspace(1) pointers.
+; Each group loads a value, adds 1 and stores it back; the expected PTX is
+; ld.acquire.cluster.global / st.release.cluster.global for every type.
+; CHECK-LABEL: global_acq_rel_cluster
+define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Volatile acquire loads and release stores on addrspace(1) pointers. The IR
+; requests syncscope("cluster"), but the expected PTX uses .sys scope
+; (ld.acquire.sys.global / st.release.sys.global) for every type.
+; CHECK-LABEL: global_acq_rel_volatile_cluster
+define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Cluster-scope seq_cst loads and stores on addrspace(1) pointers. Every access
+; is expected to appear after a fence.sc.cluster and to be emitted as
+; ld.acquire.cluster.global (loads) or st.release.cluster.global (stores).
+; CHECK-LABEL: global_seq_cst_cluster
+define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile seq_cst loads and stores on addrspace(1) pointers. The IR requests
+; syncscope("cluster"), but the expected PTX uses .sys scope: fence.sc.sys
+; followed by ld.acquire.sys.global (loads) or st.release.sys.global (stores).
+; CHECK-LABEL: global_seq_cst_volatile_cluster
+define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; shared statespace
+
+; Cluster-scope unordered atomic loads and stores on addrspace(3) pointers.
+; Each group loads a value, adds 1 and stores it back; the expected PTX is
+; ld.relaxed.cluster.shared / st.relaxed.cluster.shared for every type.
+; CHECK-LABEL: shared_unordered_cluster
+define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Volatile unordered atomic loads and stores on addrspace(3) pointers with
+; syncscope("cluster"). The expected PTX is ld.volatile.shared /
+; st.volatile.shared, with no ordering or scope qualifier.
+; CHECK-LABEL: shared_unordered_volatile_cluster
+define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Cluster-scope monotonic atomic loads and stores on addrspace(3) pointers.
+; Each group loads a value, adds 1 and stores it back; the expected PTX is
+; ld.relaxed.cluster.shared / st.relaxed.cluster.shared for every type.
+; CHECK-LABEL: shared_monotonic_cluster
+define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; NOTE(review): the f64 accesses below reuse %e (the f32 pointer); the
+ ; signature declares no separate sixth pointer. Confirm this is intended.
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Volatile variant of the monotonic cluster-scope test: atomic volatile
+; load / add-one / store on addrspace(3) pointers for i8, i16, i32, i64,
+; float and double.
+; The FileCheck patterns below expect ld.volatile.shared / st.volatile.shared,
+; i.e. no .relaxed or scope qualifier appears in the expected output.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: shared_monotonic_volatile_cluster
+define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Acquire loads paired with release stores at syncscope("cluster") on
+; addrspace(3) pointers, covering i8, i16, i32, i64, float and double.
+; The FileCheck patterns below expect ld.acquire.cluster.shared for each load
+; and st.release.cluster.shared for each store.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: shared_acq_rel_cluster
+define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Volatile variant of the acquire/release cluster-scope test: atomic volatile
+; acquire loads and release stores on addrspace(3) pointers for i8, i16, i32,
+; i64, float and double.
+; Although the IR requests syncscope("cluster"), the FileCheck patterns below
+; expect the .sys scope (ld.acquire.sys.shared / st.release.sys.shared).
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: shared_acq_rel_volatile_cluster
+define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Sequentially-consistent atomic load / add-one / store sequences at
+; syncscope("cluster") on addrspace(3) pointers, covering i8, i16, i32, i64,
+; float and double.
+; The FileCheck patterns below expect every access to be preceded by
+; fence.sc.cluster, with the load emitted as ld.acquire.cluster.shared and the
+; store as st.release.cluster.shared.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: shared_seq_cst_cluster
+define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile variant of the seq_cst cluster-scope test: atomic volatile seq_cst
+; load / add-one / store on addrspace(3) pointers for i8, i16, i32, i64,
+; float and double.
+; Although the IR requests syncscope("cluster"), the FileCheck patterns below
+; expect the .sys scope: fence.sc.sys before each access, followed by
+; ld.acquire.sys.shared or st.release.sys.shared.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: shared_seq_cst_volatile_cluster
+define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; Unordered atomic load / add-one / store sequences at syncscope("cluster") on
+; addrspace(5) pointers, covering i8, i16, i32, i64, float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; ordering or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_unordered_cluster
+define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Volatile variant of the unordered cluster-scope local test: atomic volatile
+; load / add-one / store on addrspace(5) pointers for i8, i16, i32, i64,
+; float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; volatile, ordering or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_unordered_volatile_cluster
+define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; Monotonic atomic load / add-one / store sequences at syncscope("cluster") on
+; addrspace(5) pointers, covering i8, i16, i32, i64, float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; ordering or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_monotonic_cluster
+define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Volatile variant of the monotonic cluster-scope local test: atomic volatile
+; load / add-one / store on addrspace(5) pointers for i8, i16, i32, i64,
+; float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; volatile, ordering or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_monotonic_volatile_cluster
+define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; Acquire loads paired with release stores at syncscope("cluster") on
+; addrspace(5) pointers, covering i8, i16, i32, i64, float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; acquire/release or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_acq_rel_cluster
+define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Volatile variant of the acquire/release cluster-scope local test: atomic
+; volatile acquire loads and release stores on addrspace(5) pointers for i8,
+; i16, i32, i64, float and double.
+; The FileCheck patterns below expect plain ld.local / st.local, with no
+; volatile, ordering or scope qualifier.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_acq_rel_volatile_cluster
+define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; Sequentially-consistent atomic load / add-one / store sequences at
+; syncscope("cluster") on addrspace(5) pointers, covering i8, i16, i32, i64,
+; float and double.
+; The FileCheck patterns below expect plain ld.local / st.local; unlike the
+; addrspace(3) seq_cst tests, no fence pattern is listed here.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_seq_cst_cluster
+define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; Volatile variant of the seq_cst cluster-scope local test: atomic volatile
+; seq_cst load / add-one / store on addrspace(5) pointers for i8, i16, i32,
+; i64, float and double.
+; The FileCheck patterns below expect plain ld.local / st.local; no fence,
+; volatile, ordering or scope qualifier is listed.
+; NOTE(review): the double accesses reuse %e (also the float pointer); the
+; function has only five pointer arguments - confirm this is intentional.
+; CHECK-LABEL: local_seq_cst_volatile_cluster
+define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index aac73f71a676..f922fd92fa24 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -9,10 +9,21 @@
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
+
; generic statespace
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: generic_weak
+define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -238,198 +249,198 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
ret void
}
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_sys
+define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a monotonic, align 1
+ %a.load = load atomic i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a monotonic, align 1
+ store atomic i8 %a.add, ptr %a unordered, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
+ %b.load = load atomic i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
+ store atomic i16 %b.add, ptr %b unordered, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
+ %c.load = load atomic i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
+ store atomic i32 %c.add, ptr %c unordered, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
+ %d.load = load atomic i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
+ store atomic i64 %d.add, ptr %d unordered, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.load = load atomic float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
+ store atomic float %e.add, ptr %e unordered, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
+ %f.load = load atomic double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
+ store atomic double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_volatile_sys
+define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr %a unordered, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr %b unordered, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr %c unordered, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr %d unordered, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
+ store atomic volatile float %e.add, ptr %e unordered, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
+ store atomic volatile double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_unordered
-define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_sys
+define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a unordered, align 1
+ %a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a unordered, align 1
+ store atomic i8 %a.add, ptr %a monotonic, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b unordered, align 2
+ %b.load = load atomic i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b unordered, align 2
+ store atomic i16 %b.add, ptr %b monotonic, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c unordered, align 4
+ %c.load = load atomic i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c unordered, align 4
+ store atomic i32 %c.add, ptr %c monotonic, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d unordered, align 8
+ %d.load = load atomic i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d unordered, align 8
+ store atomic i64 %d.add, ptr %d monotonic, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e unordered, align 4
+ store atomic float %e.add, ptr %e monotonic, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e unordered, align 8
+ %f.load = load atomic double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e unordered, align 8
+ store atomic double %f.add, ptr %e monotonic, align 8
ret void
}
-; CHECK-LABEL: generic_unordered_volatile
-define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_volatile_sys
+define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr %b monotonic, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr %c monotonic, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr %d monotonic, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e unordered, align 4
+ store atomic volatile float %e.add, ptr %e monotonic, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e unordered, align 8
+ %f.load = load atomic volatile double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e unordered, align 8
+ store atomic volatile double %f.add, ptr %e monotonic, align 8
ret void
}
;; global statespace
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+; CHECK-LABEL: global_weak
+define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(1) %a
%a.add = add i8 %a.load, 1
@@ -630,222 +641,222 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs
ret void
}
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_sys
+define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_volatile_sys
+define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_unordered
-define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_sys
+define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: global_unordered_volatile
-define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_volatile_sys
+define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
;; shared statespace
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+; CHECK-LABEL: shared_weak
+define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(3) %a
%a.add = add i8 %a.load, 1
@@ -1046,202 +1057,198 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
ret void
}
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_unordered_sys
+define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_unordered_volatile_sys
+define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_unordered
-define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_monotonic_sys
+define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: shared_unordered_volatile
-define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_monotonic_volatile_sys
+define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
;; local statespace
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+; CHECK-LABEL: local_weak
+define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1343,9 +1350,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace
; CHECK-LABEL: local_volatile
define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using volatile operations.
-
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1445,175 +1449,166 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_unordered_sys
+define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by generating atomic or volatile operations
-
+; CHECK-LABEL: local_unordered_volatile_sys
+define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_unordered
-define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_sys
+define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: local_unordered_volatile
-define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces \ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 20c7ae5908d2..9ec5bcd13403 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -9,26 +9,29 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32)
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+
; SM20: rotate32
; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
; SM20-LABEL: rotate32(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-NEXT: .reg .b32 %r<9>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b32 %lhs;
-; SM20-NEXT: .reg .b32 %rhs;
-; SM20-NEXT: .reg .b32 %amt2;
-; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
-; SM20-NEXT: sub.s32 %amt2, 32, %r2;
-; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
-; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT: and.b32 %r3, %r2, 31;
+; SM20-NEXT: shl.b32 %r4, %r1, %r3;
+; SM20-NEXT: neg.s32 %r5, %r2;
+; SM20-NEXT: and.b32 %r6, %r5, 31;
+; SM20-NEXT: shr.u32 %r7, %r1, %r6;
+; SM20-NEXT: or.b32 %r8, %r4, %r7;
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r8;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotate32(
@@ -50,45 +53,36 @@ define i32 @rotate32(i32 %a, i32 %b) {
define i64 @rotate64(i64 %a, i32 %b) {
; SM20-LABEL: rotate64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<2>;
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: .reg .u32 %amt2;
-; SM20-NEXT: and.b32 %amt2, %r1, 63;
-; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM20-NEXT: neg.s32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotate64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<6>;
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b32 %dummy;
-; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
-; SM35-NEXT: }
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b32 %dummy;
-; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
-; SM35-NEXT: }
-; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
-; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
-; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1];
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM35-NEXT: neg.s32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
+; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
ret i64 %val
@@ -99,45 +93,36 @@ define i64 @rotate64(i64 %a, i32 %b) {
define i64 @rotateright64(i64 %a, i32 %b) {
; SM20-LABEL: rotateright64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<2>;
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: .reg .u32 %amt2;
-; SM20-NEXT: and.b32 %amt2, %r1, 63;
-; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: shr.u64 %rd2, %rd1, %r2;
+; SM20-NEXT: neg.s32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotateright64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<6>;
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b32 %dummy;
-; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
-; SM35-NEXT: }
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b32 %dummy;
-; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
-; SM35-NEXT: }
-; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
-; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
-; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: shr.u64 %rd2, %rd1, %r2;
+; SM35-NEXT: neg.s32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
+; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
ret i64 %val
@@ -148,18 +133,14 @@ define i64 @rotateright64(i64 %a, i32 %b) {
define i32 @rotl0(i32 %x) {
; SM20-LABEL: rotl0(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-NEXT: .reg .b32 %r<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b32 %lhs;
-; SM20-NEXT: .reg .b32 %rhs;
-; SM20-NEXT: shl.b32 %lhs, %r1, 8;
-; SM20-NEXT: shr.b32 %rhs, %r1, 24;
-; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT: shr.u32 %r2, %r1, 24;
+; SM20-NEXT: shl.b32 %r3, %r1, 8;
+; SM20-NEXT: or.b32 %r4, %r3, %r2;
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl0(
@@ -177,51 +158,40 @@ define i32 @rotl0(i32 %x) {
ret i32 %t2
}
-declare i64 @llvm.fshl.i64(i64, i64, i64)
-declare i64 @llvm.fshr.i64(i64, i64, i64)
-
; SM35: rotl64
define i64 @rotl64(i64 %a, i64 %n) {
; SM20-LABEL: rotl64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<2>;
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: .reg .u32 %amt2;
-; SM20-NEXT: and.b32 %amt2, %r1, 63;
-; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM20-NEXT: neg.s32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<2>;
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b64 %lhs;
-; SM35-NEXT: .reg .b64 %rhs;
-; SM35-NEXT: .reg .u32 %amt2;
-; SM35-NEXT: and.b32 %amt2, %r1, 63;
-; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
-; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
-; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM35-NEXT: }
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM35-NEXT: neg.s32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
+; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
@@ -231,34 +201,26 @@ define i64 @rotl64(i64 %a, i64 %n) {
define i64 @rotl64_imm(i64 %a) {
; SM20-LABEL: rotl64_imm(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
-; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: shr.u64 %rd2, %rd1, 62;
+; SM20-NEXT: shl.b64 %rd3, %rd1, 2;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl64_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b64 %lhs;
-; SM35-NEXT: .reg .b64 %rhs;
-; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
-; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
-; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM35-NEXT: }
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: shr.u64 %rd2, %rd1, 62;
+; SM35-NEXT: shl.b64 %rd3, %rd1, 2;
+; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
@@ -268,44 +230,36 @@ define i64 @rotl64_imm(i64 %a) {
define i64 @rotr64(i64 %a, i64 %n) {
; SM20-LABEL: rotr64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<2>;
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: .reg .u32 %amt2;
-; SM20-NEXT: and.b32 %amt2, %r1, 63;
-; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: shr.u64 %rd2, %rd1, %r2;
+; SM20-NEXT: neg.s32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotr64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<2>;
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b64 %lhs;
-; SM35-NEXT: .reg .b64 %rhs;
-; SM35-NEXT: .reg .u32 %amt2;
-; SM35-NEXT: and.b32 %amt2, %r1, 63;
-; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
-; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
-; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
-; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM35-NEXT: }
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: shr.u64 %rd2, %rd1, %r2;
+; SM35-NEXT: neg.s32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
+; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
@@ -315,35 +269,180 @@ define i64 @rotr64(i64 %a, i64 %n) {
define i64 @rotr64_imm(i64 %a) {
; SM20-LABEL: rotr64_imm(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM20-NEXT: {
-; SM20-NEXT: .reg .b64 %lhs;
-; SM20-NEXT: .reg .b64 %rhs;
-; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
-; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
-; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT: }
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: shl.b64 %rd2, %rd1, 62;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 2;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotr64_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-NEXT: .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM35-NEXT: {
-; SM35-NEXT: .reg .b64 %lhs;
-; SM35-NEXT: .reg .b64 %rhs;
-; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
-; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
-; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
-; SM35-NEXT: }
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: shl.b64 %rd2, %rd1, 62;
+; SM35-NEXT: shr.u64 %rd3, %rd1, 2;
+; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}
+
+define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
+; SM20-LABEL: funnel_shift_right_32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<11>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2];
+; SM20-NEXT: and.b32 %r3, %r2, 31;
+; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1];
+; SM20-NEXT: shr.u32 %r5, %r4, %r3;
+; SM20-NEXT: shl.b32 %r6, %r1, 1;
+; SM20-NEXT: not.b32 %r7, %r2;
+; SM20-NEXT: and.b32 %r8, %r7, 31;
+; SM20-NEXT: shl.b32 %r9, %r6, %r8;
+; SM20-NEXT: or.b32 %r10, %r9, %r5;
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: funnel_shift_right_32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1];
+; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2];
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, %r3;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM35-NEXT: ret;
+ %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
+ ret i32 %val
+}
+
+define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
+; SM20-LABEL: funnel_shift_left_32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<11>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2];
+; SM20-NEXT: and.b32 %r3, %r2, 31;
+; SM20-NEXT: shl.b32 %r4, %r1, %r3;
+; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1];
+; SM20-NEXT: shr.u32 %r6, %r5, 1;
+; SM20-NEXT: not.b32 %r7, %r2;
+; SM20-NEXT: and.b32 %r8, %r7, 31;
+; SM20-NEXT: shr.u32 %r9, %r6, %r8;
+; SM20-NEXT: or.b32 %r10, %r4, %r9;
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: funnel_shift_left_32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1];
+; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2];
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, %r3;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM35-NEXT: ret;
+ %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
+ ret i32 %val
+}
+
+define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
+; SM20-LABEL: funnel_shift_right_64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<7>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2];
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, %r2;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 1;
+; SM20-NEXT: not.b32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shl.b64 %rd5, %rd4, %r4;
+; SM20-NEXT: or.b64 %rd6, %rd5, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: funnel_shift_right_64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<7>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2];
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
+; SM35-NEXT: shr.u64 %rd3, %rd2, %r2;
+; SM35-NEXT: shl.b64 %rd4, %rd1, 1;
+; SM35-NEXT: not.b32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shl.b64 %rd5, %rd4, %r4;
+; SM35-NEXT: or.b64 %rd6, %rd5, %rd3;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
+ ret i64 %val
+}
+
+define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
+; SM20-LABEL: funnel_shift_left_64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b64 %rd<7>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2];
+; SM20-NEXT: and.b32 %r2, %r1, 63;
+; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
+; SM20-NEXT: shr.u64 %rd4, %rd3, 1;
+; SM20-NEXT: not.b32 %r3, %r1;
+; SM20-NEXT: and.b32 %r4, %r3, 63;
+; SM20-NEXT: shr.u64 %rd5, %rd4, %r4;
+; SM20-NEXT: or.b64 %rd6, %rd2, %rd5;
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: funnel_shift_left_64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<7>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2];
+; SM35-NEXT: and.b32 %r2, %r1, 63;
+; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
+; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
+; SM35-NEXT: shr.u64 %rd4, %rd3, 1;
+; SM35-NEXT: not.b32 %r3, %r1;
+; SM35-NEXT: and.b32 %r4, %r3, 63;
+; SM35-NEXT: shr.u64 %rd5, %rd4, %r4;
+; SM35-NEXT: or.b64 %rd6, %rd2, %rd5;
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
+ ret i64 %val
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 64659ce1b5c5..05fdb02ac747 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -1,25 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
-; CHECK: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
-; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3;
-; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61;
-; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]];
-; CHECK: ret
+; CHECK-LABEL: rotate64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; CHECK-NEXT: shr.u64 %rd2, %rd1, 61;
+; CHECK-NEXT: shl.b64 %rd3, %rd1, 3;
+; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3)
ret i64 %val
}
-; CHECK: rotateright64
define i64 @rotateright64(i64 %a, i32 %b) {
-; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 61;
-; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 3;
-; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]];
-; CHECK: ret
+; CHECK-LABEL: rotateright64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; CHECK-NEXT: shl.b64 %rd2, %rd1, 61;
+; CHECK-NEXT: shr.u64 %rd3, %rd1, 3;
+; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3)
ret i64 %val
}
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
index c48361e0a803..72de456cba39 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll
@@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 16
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 16
; CHECK-NEXT: .LBB0_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: lwz 0, 12(5)
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: lwz 12, 12(5)
; CHECK-NEXT: stw 6, 44(1)
; CHECK-NEXT: stw 6, 40(1)
; CHECK-NEXT: stw 6, 36(1)
; CHECK-NEXT: stw 6, 32(1)
-; CHECK-NEXT: stw 12, 28(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 24(1)
-; CHECK-NEXT: nand 11, 0, 7
-; CHECK-NEXT: stw 10, 20(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 16(1)
-; CHECK-NEXT: rlwinm 9, 0, 29, 28, 31
-; CHECK-NEXT: lwzux 10, 9, 8
-; CHECK-NEXT: clrlwi 11, 11, 27
-; CHECK-NEXT: lwz 0, 8(9)
-; CHECK-NEXT: slw 10, 10, 12
-; CHECK-NEXT: lwz 30, 4(9)
-; CHECK-NEXT: lwz 9, 12(9)
-; CHECK-NEXT: slw 28, 30, 12
-; CHECK-NEXT: srw 30, 30, 29
-; CHECK-NEXT: srw 29, 9, 29
-; CHECK-NEXT: slw 9, 9, 12
-; CHECK-NEXT: slw 12, 0, 12
-; CHECK-NEXT: srwi 0, 0, 1
-; CHECK-NEXT: stw 9, 12(3)
-; CHECK-NEXT: or 9, 12, 29
-; CHECK-NEXT: srw 11, 0, 11
-; CHECK-NEXT: stw 9, 8(3)
-; CHECK-NEXT: or 9, 10, 30
-; CHECK-NEXT: stw 9, 0(3)
-; CHECK-NEXT: or 9, 28, 11
-; CHECK-NEXT: stw 9, 4(3)
+; CHECK-NEXT: stw 11, 28(1)
+; CHECK-NEXT: stw 10, 24(1)
+; CHECK-NEXT: clrlwi 10, 12, 27
+; CHECK-NEXT: stw 9, 20(1)
+; CHECK-NEXT: stw 8, 16(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
+; CHECK-NEXT: lwzux 9, 8, 7
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: lwz 11, 8(8)
+; CHECK-NEXT: slw 9, 9, 10
+; CHECK-NEXT: lwz 0, 4(8)
+; CHECK-NEXT: lwz 8, 12(8)
+; CHECK-NEXT: srw 30, 11, 12
+; CHECK-NEXT: slw 29, 0, 10
+; CHECK-NEXT: srw 0, 0, 12
+; CHECK-NEXT: srw 12, 8, 12
+; CHECK-NEXT: slw 11, 11, 10
+; CHECK-NEXT: slw 8, 8, 10
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 11, 12
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 9, 0
+; CHECK-NEXT: stw 8, 0(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
@@ -83,59 +77,53 @@ for.end: ; preds = %for.body
define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 2048
-; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 7
-; CHECK-NEXT: mtctr 7
-; CHECK-NEXT: addi 7, 1, 36
+; CHECK-NEXT: stwu 1, -48(1)
+; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill
+; CHECK-NEXT: li 6, 2048
+; CHECK-NEXT: mtctr 6
+; CHECK-NEXT: addi 6, 1, 24
; CHECK-NEXT: .LBB1_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 8, 0(4)
-; CHECK-NEXT: lwz 10, 8(4)
-; CHECK-NEXT: lwz 12, 12(5)
-; CHECK-NEXT: lwz 9, 4(4)
-; CHECK-NEXT: lwz 11, 12(4)
-; CHECK-NEXT: stw 10, 44(1)
-; CHECK-NEXT: rlwinm 10, 12, 29, 28, 31
-; CHECK-NEXT: stw 8, 36(1)
-; CHECK-NEXT: srawi 8, 8, 31
-; CHECK-NEXT: stw 11, 48(1)
-; CHECK-NEXT: clrlwi 11, 12, 29
-; CHECK-NEXT: stw 9, 40(1)
-; CHECK-NEXT: nand 9, 12, 6
-; CHECK-NEXT: stw 8, 32(1)
-; CHECK-NEXT: subfic 30, 11, 32
+; CHECK-NEXT: lwz 7, 0(4)
+; CHECK-NEXT: lwz 8, 4(4)
+; CHECK-NEXT: lwz 11, 12(5)
+; CHECK-NEXT: lwz 9, 8(4)
+; CHECK-NEXT: lwz 10, 12(4)
; CHECK-NEXT: stw 8, 28(1)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: stw 8, 24(1)
-; CHECK-NEXT: stw 8, 20(1)
-; CHECK-NEXT: sub 8, 7, 10
-; CHECK-NEXT: lwz 10, 4(8)
-; CHECK-NEXT: lwz 12, 8(8)
-; CHECK-NEXT: lwz 0, 0(8)
-; CHECK-NEXT: lwz 8, 12(8)
-; CHECK-NEXT: srw 29, 12, 11
-; CHECK-NEXT: slw 12, 12, 30
-; CHECK-NEXT: slw 30, 0, 30
-; CHECK-NEXT: srw 8, 8, 11
-; CHECK-NEXT: sraw 0, 0, 11
-; CHECK-NEXT: srw 11, 10, 11
-; CHECK-NEXT: slwi 10, 10, 1
-; CHECK-NEXT: or 8, 12, 8
-; CHECK-NEXT: slw 9, 10, 9
-; CHECK-NEXT: stw 8, 12(3)
-; CHECK-NEXT: or 8, 30, 11
-; CHECK-NEXT: stw 8, 4(3)
-; CHECK-NEXT: or 8, 29, 9
-; CHECK-NEXT: stw 0, 0(3)
-; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: rlwinm 8, 11, 29, 28, 29
+; CHECK-NEXT: stw 7, 24(1)
+; CHECK-NEXT: srawi 7, 7, 31
+; CHECK-NEXT: stw 10, 36(1)
+; CHECK-NEXT: clrlwi 10, 11, 27
+; CHECK-NEXT: stw 9, 32(1)
+; CHECK-NEXT: subfic 12, 10, 32
+; CHECK-NEXT: stw 7, 20(1)
+; CHECK-NEXT: stw 7, 16(1)
+; CHECK-NEXT: stw 7, 12(1)
+; CHECK-NEXT: stw 7, 8(1)
+; CHECK-NEXT: sub 7, 6, 8
+; CHECK-NEXT: lwz 8, 4(7)
+; CHECK-NEXT: lwz 9, 0(7)
+; CHECK-NEXT: lwz 11, 12(7)
+; CHECK-NEXT: srw 0, 8, 10
+; CHECK-NEXT: lwz 7, 8(7)
+; CHECK-NEXT: slw 30, 9, 12
+; CHECK-NEXT: slw 8, 8, 12
+; CHECK-NEXT: srw 11, 11, 10
+; CHECK-NEXT: slw 12, 7, 12
+; CHECK-NEXT: srw 7, 7, 10
+; CHECK-NEXT: or 7, 8, 7
+; CHECK-NEXT: stw 7, 8(3)
+; CHECK-NEXT: or 7, 12, 11
+; CHECK-NEXT: sraw 9, 9, 10
+; CHECK-NEXT: stw 7, 12(3)
+; CHECK-NEXT: or 7, 30, 0
+; CHECK-NEXT: stw 9, 0(3)
+; CHECK-NEXT: stw 7, 4(3)
; CHECK-NEXT: bdnz .LBB1_1
; CHECK-NEXT: # %bb.2: # %for.end
-; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: addi 1, 1, 64
+; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload
+; CHECK-NEXT: addi 1, 1, 48
; CHECK-NEXT: blr
entry:
br label %for.body
@@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 {
; CHECK-LABEL: foo3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: stwu 1, -64(1)
-; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 8, 2048
; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: li 7, 2048
; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill
-; CHECK-NEXT: li 7, 7
-; CHECK-NEXT: mtctr 8
-; CHECK-NEXT: addi 8, 1, 32
+; CHECK-NEXT: li 6, 0
+; CHECK-NEXT: mtctr 7
+; CHECK-NEXT: addi 7, 1, 32
; CHECK-NEXT: .LBB2_1: # %for.body
; CHECK-NEXT: #
-; CHECK-NEXT: lwz 10, 4(4)
-; CHECK-NEXT: lwz 0, 12(5)
-; CHECK-NEXT: lwz 9, 0(4)
-; CHECK-NEXT: lwz 11, 8(4)
-; CHECK-NEXT: lwz 12, 12(4)
-; CHECK-NEXT: stw 10, 36(1)
-; CHECK-NEXT: rlwinm 10, 0, 29, 28, 31
+; CHECK-NEXT: lwz 8, 0(4)
+; CHECK-NEXT: lwz 12, 12(5)
+; CHECK-NEXT: lwz 9, 4(4)
+; CHECK-NEXT: lwz 10, 8(4)
+; CHECK-NEXT: lwz 11, 12(4)
+; CHECK-NEXT: stw 8, 32(1)
+; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29
; CHECK-NEXT: stw 6, 28(1)
-; CHECK-NEXT: sub 10, 8, 10
+; CHECK-NEXT: sub 8, 7, 8
; CHECK-NEXT: stw 6, 24(1)
; CHECK-NEXT: stw 6, 20(1)
; CHECK-NEXT: stw 6, 16(1)
-; CHECK-NEXT: stw 12, 44(1)
-; CHECK-NEXT: clrlwi 12, 0, 29
-; CHECK-NEXT: stw 11, 40(1)
-; CHECK-NEXT: subfic 29, 12, 32
-; CHECK-NEXT: stw 9, 32(1)
-; CHECK-NEXT: nand 9, 0, 7
-; CHECK-NEXT: lwz 11, 4(10)
-; CHECK-NEXT: clrlwi 9, 9, 27
-; CHECK-NEXT: lwz 0, 8(10)
-; CHECK-NEXT: lwz 30, 0(10)
-; CHECK-NEXT: lwz 10, 12(10)
-; CHECK-NEXT: srw 28, 0, 12
-; CHECK-NEXT: slw 0, 0, 29
-; CHECK-NEXT: slw 29, 30, 29
-; CHECK-NEXT: srw 10, 10, 12
-; CHECK-NEXT: srw 30, 30, 12
-; CHECK-NEXT: srw 12, 11, 12
-; CHECK-NEXT: slwi 11, 11, 1
-; CHECK-NEXT: slw 9, 11, 9
-; CHECK-NEXT: or 10, 0, 10
-; CHECK-NEXT: stw 10, 12(3)
-; CHECK-NEXT: or 10, 29, 12
-; CHECK-NEXT: or 9, 28, 9
-; CHECK-NEXT: stw 30, 0(3)
-; CHECK-NEXT: stw 10, 4(3)
-; CHECK-NEXT: stw 9, 8(3)
+; CHECK-NEXT: stw 11, 44(1)
+; CHECK-NEXT: clrlwi 11, 12, 27
+; CHECK-NEXT: stw 10, 40(1)
+; CHECK-NEXT: subfic 0, 11, 32
+; CHECK-NEXT: stw 9, 36(1)
+; CHECK-NEXT: lwz 9, 4(8)
+; CHECK-NEXT: lwz 10, 0(8)
+; CHECK-NEXT: lwz 12, 12(8)
+; CHECK-NEXT: srw 30, 9, 11
+; CHECK-NEXT: lwz 8, 8(8)
+; CHECK-NEXT: slw 29, 10, 0
+; CHECK-NEXT: slw 9, 9, 0
+; CHECK-NEXT: srw 12, 12, 11
+; CHECK-NEXT: slw 0, 8, 0
+; CHECK-NEXT: srw 8, 8, 11
+; CHECK-NEXT: or 8, 9, 8
+; CHECK-NEXT: stw 8, 8(3)
+; CHECK-NEXT: or 8, 0, 12
+; CHECK-NEXT: srw 10, 10, 11
+; CHECK-NEXT: stw 8, 12(3)
+; CHECK-NEXT: or 8, 29, 30
+; CHECK-NEXT: stw 10, 0(3)
+; CHECK-NEXT: stw 8, 4(3)
; CHECK-NEXT: bdnz .LBB2_1
; CHECK-NEXT: # %bb.2: # %for.end
; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload
; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload
-; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 1, 1, 64
; CHECK-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll
index 3e328c6ad9f0..d3ca1139b4fd 100644
--- a/llvm/test/CodeGen/PowerPC/pr59074.ll
+++ b/llvm/test/CodeGen/PowerPC/pr59074.ll
@@ -32,37 +32,36 @@ define void @pr59074(ptr %0) {
; LE32-NEXT: li 7, 0
; LE32-NEXT: li 8, 12
; LE32-NEXT: xxswapd 0, 0
+; LE32-NEXT: rlwimi 5, 6, 0, 30, 28
; LE32-NEXT: addi 4, 4, -12
-; LE32-NEXT: rlwinm 9, 4, 29, 28, 31
-; LE32-NEXT: stxvd2x 0, 6, 5
+; LE32-NEXT: rlwinm 9, 4, 29, 28, 29
+; LE32-NEXT: stxvd2x 0, 0, 5
; LE32-NEXT: stw 7, 44(1)
; LE32-NEXT: stw 7, 40(1)
; LE32-NEXT: stw 7, 36(1)
; LE32-NEXT: stw 8, 16(1)
+; LE32-NEXT: clrlwi 4, 4, 27
; LE32-NEXT: lwzux 5, 9, 6
-; LE32-NEXT: li 6, 7
-; LE32-NEXT: lwz 7, 8(9)
-; LE32-NEXT: nand 6, 4, 6
-; LE32-NEXT: lwz 8, 4(9)
-; LE32-NEXT: clrlwi 4, 4, 29
-; LE32-NEXT: lwz 9, 12(9)
-; LE32-NEXT: clrlwi 6, 6, 27
+; LE32-NEXT: lwz 6, 8(9)
+; LE32-NEXT: lwz 7, 4(9)
+; LE32-NEXT: lwz 8, 12(9)
+; LE32-NEXT: xori 9, 4, 31
; LE32-NEXT: subfic 11, 4, 32
; LE32-NEXT: srw 5, 5, 4
-; LE32-NEXT: slwi 10, 7, 1
-; LE32-NEXT: srw 7, 7, 4
-; LE32-NEXT: slw 6, 10, 6
-; LE32-NEXT: srw 10, 8, 4
-; LE32-NEXT: slw 8, 8, 11
-; LE32-NEXT: slw 11, 9, 11
-; LE32-NEXT: srw 4, 9, 4
-; LE32-NEXT: or 5, 8, 5
-; LE32-NEXT: or 7, 11, 7
-; LE32-NEXT: or 6, 10, 6
+; LE32-NEXT: slwi 10, 6, 1
+; LE32-NEXT: srw 6, 6, 4
+; LE32-NEXT: slw 9, 10, 9
+; LE32-NEXT: srw 10, 7, 4
+; LE32-NEXT: slw 7, 7, 11
+; LE32-NEXT: slw 11, 8, 11
+; LE32-NEXT: srw 4, 8, 4
+; LE32-NEXT: or 5, 7, 5
+; LE32-NEXT: or 6, 11, 6
+; LE32-NEXT: or 7, 10, 9
; LE32-NEXT: stw 4, 12(3)
-; LE32-NEXT: stw 7, 8(3)
+; LE32-NEXT: stw 6, 8(3)
; LE32-NEXT: stw 5, 0(3)
-; LE32-NEXT: stw 6, 4(3)
+; LE32-NEXT: stw 7, 4(3)
; LE32-NEXT: addi 1, 1, 80
; LE32-NEXT: blr
;
@@ -89,37 +88,33 @@ define void @pr59074(ptr %0) {
; BE32-NEXT: li 6, 12
; BE32-NEXT: li 7, 0
; BE32-NEXT: addi 8, 1, -48
-; BE32-NEXT: li 10, 7
; BE32-NEXT: stxvw4x 0, 0, 5
-; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 6, -36(1)
+; BE32-NEXT: addi 4, 4, -12
; BE32-NEXT: stw 7, -40(1)
; BE32-NEXT: stw 7, -44(1)
-; BE32-NEXT: rlwinm 9, 4, 29, 28, 31
; BE32-NEXT: stw 7, -48(1)
+; BE32-NEXT: rlwinm 9, 4, 29, 28, 29
+; BE32-NEXT: clrlwi 4, 4, 27
; BE32-NEXT: sub 5, 8, 9
-; BE32-NEXT: nand 6, 4, 10
-; BE32-NEXT: clrlwi 4, 4, 29
-; BE32-NEXT: clrlwi 6, 6, 27
-; BE32-NEXT: lwz 7, 4(5)
-; BE32-NEXT: lwz 8, 8(5)
-; BE32-NEXT: lwz 9, 0(5)
-; BE32-NEXT: lwz 5, 12(5)
-; BE32-NEXT: slwi 10, 7, 1
-; BE32-NEXT: srw 11, 8, 4
-; BE32-NEXT: srw 7, 7, 4
-; BE32-NEXT: srw 5, 5, 4
-; BE32-NEXT: slw 6, 10, 6
+; BE32-NEXT: lwz 6, 4(5)
+; BE32-NEXT: lwz 7, 0(5)
+; BE32-NEXT: lwz 8, 12(5)
+; BE32-NEXT: lwz 5, 8(5)
; BE32-NEXT: subfic 10, 4, 32
-; BE32-NEXT: srw 4, 9, 4
-; BE32-NEXT: slw 8, 8, 10
-; BE32-NEXT: slw 10, 9, 10
-; BE32-NEXT: or 6, 11, 6
-; BE32-NEXT: or 7, 10, 7
-; BE32-NEXT: or 5, 8, 5
+; BE32-NEXT: srw 9, 6, 4
+; BE32-NEXT: slw 11, 7, 10
+; BE32-NEXT: srw 8, 8, 4
+; BE32-NEXT: slw 6, 6, 10
+; BE32-NEXT: slw 10, 5, 10
+; BE32-NEXT: srw 5, 5, 4
+; BE32-NEXT: srw 4, 7, 4
+; BE32-NEXT: or 7, 11, 9
+; BE32-NEXT: or 8, 10, 8
+; BE32-NEXT: or 5, 6, 5
; BE32-NEXT: stw 4, 0(3)
-; BE32-NEXT: stw 6, 8(3)
-; BE32-NEXT: stw 5, 12(3)
+; BE32-NEXT: stw 5, 8(3)
+; BE32-NEXT: stw 8, 12(3)
; BE32-NEXT: stw 7, 4(3)
; BE32-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
index f6fdb4ae2079..4f1b7bdc8b55 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -233,9 +233,96 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: addi 3, 1, 32
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 8(3)
+; LE-64BIT-NEXT: ld 3, 0(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: srd 4, 6, 4
+; LE-64BIT-NEXT: srd 7, 6, 7
+; LE-64BIT-NEXT: std 4, 8(5)
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: lshr_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 3, 8(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: sld 7, 6, 7
+; BE-NEXT: addi 8, 4, -64
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: srd 7, 6, 8
+; BE-NEXT: srd 4, 6, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: std 4, 0(5)
+; BE-NEXT: std 3, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 32
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: sub 3, 3, 4
; LE-32BIT-NEXT: stw 6, 24(1)
@@ -255,12 +342,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = lshr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: shl_16bytes:
; LE-64BIT: # %bb.0:
@@ -309,7 +397,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 9, 4(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: lwz 6, 12(6)
+; LE-32BIT-NEXT: slw 11, 9, 4
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: or 3, 3, 9
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 0(3)
+; LE-64BIT-NEXT: ld 3, 8(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: srd 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: sld 4, 6, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: std 3, 8(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 8(3)
+; BE-NEXT: ld 3, 0(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: srd 7, 6, 7
+; BE-NEXT: addi 8, 4, -64
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: sld 7, 6, 8
+; BE-NEXT: sld 4, 6, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: std 4, 8(5)
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
@@ -328,12 +502,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = shl i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_16bytes:
; LE-64BIT: # %bb.0:
@@ -361,17 +536,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: slwi 4, 4, 3
; BE-NEXT: addi 7, 4, -64
; BE-NEXT: cmpwi 7, 1
-; BE-NEXT: blt 0, .LBB8_2
+; BE-NEXT: blt 0, .LBB10_2
; BE-NEXT: # %bb.1:
; BE-NEXT: srad 3, 6, 7
-; BE-NEXT: b .LBB8_3
-; BE-NEXT: .LBB8_2:
+; BE-NEXT: b .LBB10_3
+; BE-NEXT: .LBB10_2:
; BE-NEXT: ld 3, 8(3)
; BE-NEXT: subfic 7, 4, 64
; BE-NEXT: sld 7, 6, 7
; BE-NEXT: srd 3, 3, 4
; BE-NEXT: or 3, 3, 7
-; BE-NEXT: .LBB8_3:
+; BE-NEXT: .LBB10_3:
; BE-NEXT: srad 4, 6, 4
; BE-NEXT: std 3, 8(5)
; BE-NEXT: std 4, 0(5)
@@ -388,7 +563,100 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 28
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 28, 29
+; LE-32BIT-NEXT: stw 9, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 48
+; LE-32BIT-NEXT: blr
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_16bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: ld 6, 8(3)
+; LE-64BIT-NEXT: ld 3, 0(3)
+; LE-64BIT-NEXT: slwi 4, 4, 5
+; LE-64BIT-NEXT: subfic 7, 4, 64
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sld 7, 6, 7
+; LE-64BIT-NEXT: or 3, 3, 7
+; LE-64BIT-NEXT: addi 7, 4, -64
+; LE-64BIT-NEXT: srad 4, 6, 4
+; LE-64BIT-NEXT: cmpwi 7, 1
+; LE-64BIT-NEXT: srad 8, 6, 7
+; LE-64BIT-NEXT: std 4, 8(5)
+; LE-64BIT-NEXT: isellt 3, 3, 8
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_16bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: lwz 4, 12(4)
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: slwi 4, 4, 5
+; BE-NEXT: addi 7, 4, -64
+; BE-NEXT: cmpwi 7, 1
+; BE-NEXT: blt 0, .LBB11_2
+; BE-NEXT: # %bb.1:
+; BE-NEXT: srad 3, 6, 7
+; BE-NEXT: b .LBB11_3
+; BE-NEXT: .LBB11_2:
+; BE-NEXT: ld 3, 8(3)
+; BE-NEXT: subfic 7, 4, 64
+; BE-NEXT: sld 7, 6, 7
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: or 3, 3, 7
+; BE-NEXT: .LBB11_3:
+; BE-NEXT: srad 4, 6, 4
+; BE-NEXT: std 3, 8(5)
+; BE-NEXT: std 4, 0(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_16bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -48(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 32
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29
; LE-32BIT-NEXT: stw 9, 40(1)
; LE-32BIT-NEXT: stw 8, 36(1)
; LE-32BIT-NEXT: stw 7, 32(1)
@@ -408,8 +676,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
%res = ashr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
@@ -422,12 +690,324 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 1, 0, 3
; LE-64BIT-NEXT: xxlxor 2, 2, 2
; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: li 8, 32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: li 4, 48
+; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 2, 7, 8
+; LE-64BIT-NEXT: rlwinm 4, 3, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 26, 28
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: stxvd2x 1, 0, 7
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: subfic 7, 3, 64
+; LE-64BIT-NEXT: ld 8, 8(4)
+; LE-64BIT-NEXT: ld 9, 16(4)
+; LE-64BIT-NEXT: ld 4, 24(4)
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sld 10, 8, 7
+; LE-64BIT-NEXT: sld 11, 4, 7
+; LE-64BIT-NEXT: srd 8, 8, 3
+; LE-64BIT-NEXT: sld 7, 9, 7
+; LE-64BIT-NEXT: or 6, 10, 6
+; LE-64BIT-NEXT: srd 10, 9, 3
+; LE-64BIT-NEXT: srd 3, 4, 3
+; LE-64BIT-NEXT: or 7, 7, 8
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: std 6, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: lshr_32bytes:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
+; BE-NEXT: neg 3, 3
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_32bytes:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: sub 3, 3, 6
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: li 8, 32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: li 4, 48
+; LE-64BIT-NEXT: stxvd2x 2, 7, 4
+; LE-64BIT-NEXT: stxvd2x 2, 7, 8
+; LE-64BIT-NEXT: rlwinm 4, 3, 2, 27, 28
+; LE-64BIT-NEXT: rlwinm 3, 3, 5, 26, 26
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: stxvd2x 1, 0, 7
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: subfic 7, 3, 64
+; LE-64BIT-NEXT: ld 8, 8(4)
+; LE-64BIT-NEXT: ld 9, 16(4)
+; LE-64BIT-NEXT: ld 4, 24(4)
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sld 10, 8, 7
+; LE-64BIT-NEXT: sld 11, 4, 7
+; LE-64BIT-NEXT: srd 8, 8, 3
+; LE-64BIT-NEXT: sld 7, 9, 7
+; LE-64BIT-NEXT: or 6, 10, 6
+; LE-64BIT-NEXT: srd 10, 9, 3
+; LE-64BIT-NEXT: srd 3, 4, 3
+; LE-64BIT-NEXT: or 7, 7, 8
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: std 6, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: lshr_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 2, 27, 28
+; BE-NEXT: neg 3, 3
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srd 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: lshr_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT: stw 6, 44(1)
+; LE-32BIT-NEXT: sub 3, 3, 4
+; LE-32BIT-NEXT: stw 6, 40(1)
+; LE-32BIT-NEXT: stw 6, 36(1)
+; LE-32BIT-NEXT: stw 6, 32(1)
+; LE-32BIT-NEXT: stw 6, 28(1)
+; LE-32BIT-NEXT: stw 6, 24(1)
+; LE-32BIT-NEXT: stw 6, 20(1)
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 4, 4(3)
+; LE-32BIT-NEXT: lwz 6, 0(3)
+; LE-32BIT-NEXT: lwz 7, 12(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 16(3)
+; LE-32BIT-NEXT: lwz 11, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 0(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: lshr_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: lwz 3, 0(4)
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
; LE-64BIT-NEXT: li 4, 32
-; LE-64BIT-NEXT: clrldi 3, 3, 59
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
@@ -438,25 +1018,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: stxvd2x 0, 0, 5
; LE-64BIT-NEXT: blr
;
-; BE-LABEL: lshr_32bytes:
+; BE-LABEL: lshr_32bytes_dwordOff:
; BE: # %bb.0:
-; BE-NEXT: ld 6, 0(3)
-; BE-NEXT: ld 7, 8(3)
-; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: clrlwi 3, 4, 27
+; BE-NEXT: li 6, 0
+; BE-NEXT: std 6, -40(1)
+; BE-NEXT: std 6, -48(1)
+; BE-NEXT: std 6, -56(1)
+; BE-NEXT: std 6, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 3, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: std 7, -32(1)
; BE-NEXT: extsw 3, 3
; BE-NEXT: addi 4, 1, -32
; BE-NEXT: ldux 3, 4, 3
@@ -469,7 +1048,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: std 6, 8(5)
; BE-NEXT: blr
;
-; LE-32BIT-LABEL: lshr_32bytes:
+; LE-32BIT-LABEL: lshr_32bytes_dwordOff:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -80(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
@@ -484,7 +1063,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 3, 76(1)
; LE-32BIT-NEXT: addi 3, 1, 48
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: sub 3, 3, 4
; LE-32BIT-NEXT: stw 6, 40(1)
@@ -520,16 +1099,329 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 80
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: shl_32bytes:
; LE-64BIT: # %bb.0:
; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: addi 8, 1, -32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: stxvd2x 2, 7, 6
+; LE-64BIT-NEXT: li 6, 48
+; LE-64BIT-NEXT: rlwinm 3, 4, 0, 27, 28
+; LE-64BIT-NEXT: rlwinm 4, 4, 3, 26, 28
+; LE-64BIT-NEXT: neg 3, 3
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: li 6, 32
+; LE-64BIT-NEXT: extsw 3, 3
+; LE-64BIT-NEXT: stxvd2x 1, 7, 6
+; LE-64BIT-NEXT: stxvd2x 2, 0, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
+; LE-64BIT-NEXT: ldux 3, 8, 3
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
+; LE-64BIT-NEXT: sld 7, 7, 4
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: or 6, 8, 6
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_32bytes:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
+; BE-NEXT: std 6, -64(1)
+; BE-NEXT: rlwinm 3, 4, 0, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: sld 6, 6, 4
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
+; BE-NEXT: std 3, 24(5)
+; BE-NEXT: std 7, 16(5)
+; BE-NEXT: std 6, 0(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_32bytes:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 72(1)
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 64(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
+; LE-32BIT-NEXT: stw 6, 56(1)
+; LE-32BIT-NEXT: stw 6, 52(1)
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 6, 3
+; LE-32BIT-NEXT: lwz 7, 8(6)
+; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
+; LE-32BIT-NEXT: lwz 6, 28(6)
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
+; LE-32BIT-NEXT: stw 4, 24(5)
+; LE-32BIT-NEXT: or 4, 9, 0
+; LE-32BIT-NEXT: stw 4, 16(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
+; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
+; LE-64BIT-NEXT: lwz 4, 0(4)
+; LE-64BIT-NEXT: xxlxor 2, 2, 2
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: lxvd2x 1, 0, 3
+; LE-64BIT-NEXT: addi 8, 1, -32
+; LE-64BIT-NEXT: lxvd2x 0, 3, 6
+; LE-64BIT-NEXT: stxvd2x 2, 7, 6
+; LE-64BIT-NEXT: li 6, 48
+; LE-64BIT-NEXT: rlwinm 3, 4, 2, 27, 28
+; LE-64BIT-NEXT: rlwinm 4, 4, 5, 26, 26
+; LE-64BIT-NEXT: neg 3, 3
+; LE-64BIT-NEXT: stxvd2x 0, 7, 6
+; LE-64BIT-NEXT: li 6, 32
+; LE-64BIT-NEXT: extsw 3, 3
+; LE-64BIT-NEXT: stxvd2x 1, 7, 6
+; LE-64BIT-NEXT: stxvd2x 2, 0, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
+; LE-64BIT-NEXT: ldux 3, 8, 3
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
+; LE-64BIT-NEXT: sld 7, 7, 4
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: sld 3, 3, 4
+; LE-64BIT-NEXT: or 6, 8, 6
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: shl_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 6, 0(3)
+; BE-NEXT: ld 7, 8(3)
+; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
+; BE-NEXT: std 6, -64(1)
+; BE-NEXT: rlwinm 3, 4, 2, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: sld 6, 6, 4
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
+; BE-NEXT: sld 3, 3, 4
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
+; BE-NEXT: std 3, 24(5)
+; BE-NEXT: std 7, 16(5)
+; BE-NEXT: std 6, 0(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: shl_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: li 6, 0
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 6, 76(1)
+; LE-32BIT-NEXT: stw 6, 72(1)
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT: stw 6, 68(1)
+; LE-32BIT-NEXT: stw 6, 64(1)
+; LE-32BIT-NEXT: stw 6, 60(1)
+; LE-32BIT-NEXT: stw 6, 56(1)
+; LE-32BIT-NEXT: stw 6, 52(1)
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
+; LE-32BIT-NEXT: lwzux 3, 4, 3
+; LE-32BIT-NEXT: lwz 6, 4(4)
+; LE-32BIT-NEXT: lwz 7, 12(4)
+; LE-32BIT-NEXT: lwz 8, 8(4)
+; LE-32BIT-NEXT: lwz 9, 20(4)
+; LE-32BIT-NEXT: lwz 10, 16(4)
+; LE-32BIT-NEXT: lwz 11, 28(4)
+; LE-32BIT-NEXT: lwz 4, 24(4)
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: stw 4, 24(5)
+; LE-32BIT-NEXT: stw 11, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: shl_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: li 6, 16
; LE-64BIT-NEXT: lxvd2x 1, 0, 3
; LE-64BIT-NEXT: xxlxor 2, 2, 2
; LE-64BIT-NEXT: li 7, 48
@@ -537,7 +1429,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lwz 3, 0(4)
; LE-64BIT-NEXT: addi 4, 1, -64
; LE-64BIT-NEXT: stxvd2x 2, 4, 6
-; LE-64BIT-NEXT: clrlwi 3, 3, 27
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
; LE-64BIT-NEXT: stxvd2x 0, 4, 7
; LE-64BIT-NEXT: li 7, 32
; LE-64BIT-NEXT: neg 3, 3
@@ -552,25 +1444,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: stxvd2x 0, 0, 5
; LE-64BIT-NEXT: blr
;
-; BE-LABEL: shl_32bytes:
+; BE-LABEL: shl_32bytes_dwordOff:
; BE: # %bb.0:
-; BE-NEXT: ld 6, 0(3)
-; BE-NEXT: ld 7, 8(3)
-; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
-; BE-NEXT: std 6, -64(1)
-; BE-NEXT: clrldi 3, 4, 59
-; BE-NEXT: ldux 4, 3, 9
+; BE-NEXT: li 6, 0
+; BE-NEXT: std 6, -8(1)
+; BE-NEXT: std 6, -16(1)
+; BE-NEXT: std 6, -24(1)
+; BE-NEXT: std 6, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 8, -56(1)
+; BE-NEXT: std 7, -64(1)
+; BE-NEXT: rlwinm 3, 4, 3, 27, 28
+; BE-NEXT: addi 4, 1, -64
+; BE-NEXT: ldux 4, 3, 4
; BE-NEXT: ld 6, 8(3)
; BE-NEXT: ld 7, 24(3)
; BE-NEXT: ld 3, 16(3)
@@ -580,7 +1472,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; BE-NEXT: std 6, 8(5)
; BE-NEXT: blr
;
-; LE-32BIT-LABEL: shl_32bytes:
+; LE-32BIT-LABEL: shl_32bytes_dwordOff:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -80(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
@@ -595,7 +1487,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 6, 76(1)
; LE-32BIT-NEXT: stw 6, 72(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
; LE-32BIT-NEXT: stw 6, 68(1)
; LE-32BIT-NEXT: stw 6, 64(1)
; LE-32BIT-NEXT: stw 6, 60(1)
@@ -612,87 +1504,403 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 8, 20(1)
; LE-32BIT-NEXT: stw 7, 16(1)
; LE-32BIT-NEXT: lwzux 3, 4, 3
-; LE-32BIT-NEXT: lwz 6, 4(4)
-; LE-32BIT-NEXT: lwz 7, 12(4)
-; LE-32BIT-NEXT: lwz 8, 8(4)
-; LE-32BIT-NEXT: lwz 9, 20(4)
-; LE-32BIT-NEXT: lwz 10, 16(4)
-; LE-32BIT-NEXT: lwz 11, 28(4)
-; LE-32BIT-NEXT: lwz 4, 24(4)
+; LE-32BIT-NEXT: lwz 6, 12(4)
+; LE-32BIT-NEXT: lwz 7, 8(4)
+; LE-32BIT-NEXT: lwz 8, 20(4)
+; LE-32BIT-NEXT: lwz 9, 16(4)
+; LE-32BIT-NEXT: lwz 10, 28(4)
+; LE-32BIT-NEXT: lwz 11, 24(4)
+; LE-32BIT-NEXT: ori 4, 4, 4
+; LE-32BIT-NEXT: lwz 4, 0(4)
; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: stw 4, 24(5)
-; LE-32BIT-NEXT: stw 11, 28(5)
-; LE-32BIT-NEXT: stw 10, 16(5)
-; LE-32BIT-NEXT: stw 9, 20(5)
-; LE-32BIT-NEXT: stw 8, 8(5)
-; LE-32BIT-NEXT: stw 7, 12(5)
-; LE-32BIT-NEXT: stw 6, 4(5)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 10, 28(5)
+; LE-32BIT-NEXT: stw 9, 16(5)
+; LE-32BIT-NEXT: stw 8, 20(5)
+; LE-32BIT-NEXT: stw 7, 8(5)
+; LE-32BIT-NEXT: stw 6, 12(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 80
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = shl i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
+
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: ld 6, 24(3)
; LE-64BIT-NEXT: lxvd2x 0, 0, 3
-; LE-64BIT-NEXT: ld 6, 16(3)
-; LE-64BIT-NEXT: ld 3, 24(3)
+; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: ld 3, 16(3)
+; LE-64BIT-NEXT: sradi 8, 6, 63
+; LE-64BIT-NEXT: rlwinm 9, 4, 0, 27, 28
+; LE-64BIT-NEXT: stxvd2x 0, 0, 7
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: rlwinm 3, 4, 3, 26, 28
+; LE-64BIT-NEXT: ldux 4, 9, 7
+; LE-64BIT-NEXT: ld 7, 8(9)
+; LE-64BIT-NEXT: subfic 6, 3, 64
+; LE-64BIT-NEXT: ld 8, 16(9)
+; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 4, 4, 3
+; LE-64BIT-NEXT: sld 10, 7, 6
+; LE-64BIT-NEXT: sld 11, 9, 6
+; LE-64BIT-NEXT: srd 7, 7, 3
+; LE-64BIT-NEXT: sld 6, 8, 6
+; LE-64BIT-NEXT: or 4, 10, 4
+; LE-64BIT-NEXT: srd 10, 8, 3
+; LE-64BIT-NEXT: srad 3, 9, 3
+; LE-64BIT-NEXT: or 6, 6, 7
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_32bytes:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 0, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
+; BE-NEXT: std 3, -64(1)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: rlwinm 4, 4, 3, 26, 28
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srad 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -112(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 48
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 0, 27, 29
+; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
+; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: subfic 0, 4, 32
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 36(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 30, 6, 4
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: slw 29, 7, 0
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: srw 28, 8, 4
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: slw 27, 9, 0
+; LE-32BIT-NEXT: lwz 12, 28(3)
+; LE-32BIT-NEXT: slw 6, 6, 0
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: srw 26, 10, 4
+; LE-32BIT-NEXT: slw 25, 11, 0
+; LE-32BIT-NEXT: slw 8, 8, 0
+; LE-32BIT-NEXT: slw 10, 10, 0
+; LE-32BIT-NEXT: slw 0, 3, 0
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 12, 12, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 0, 12
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: or 3, 8, 11
+; LE-32BIT-NEXT: srw 9, 9, 4
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: stw 3, 20(5)
+; LE-32BIT-NEXT: or 3, 6, 9
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 27, 28
+; LE-32BIT-NEXT: sraw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
+; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload
+; LE-32BIT-NEXT: addi 1, 1, 112
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_wordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: ld 6, 24(3)
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: lwz 4, 0(4)
-; LE-64BIT-NEXT: li 8, 16
-; LE-64BIT-NEXT: std 3, 24(7)
-; LE-64BIT-NEXT: sradi 3, 3, 63
-; LE-64BIT-NEXT: std 6, 16(7)
-; LE-64BIT-NEXT: std 3, 56(7)
-; LE-64BIT-NEXT: std 3, 48(7)
-; LE-64BIT-NEXT: std 3, 40(7)
-; LE-64BIT-NEXT: std 3, 32(7)
-; LE-64BIT-NEXT: clrldi 3, 4, 59
+; LE-64BIT-NEXT: addi 7, 1, -64
+; LE-64BIT-NEXT: ld 3, 16(3)
+; LE-64BIT-NEXT: sradi 8, 6, 63
+; LE-64BIT-NEXT: rlwinm 9, 4, 2, 27, 28
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: lxvd2x 0, 7, 3
-; LE-64BIT-NEXT: add 3, 7, 3
-; LE-64BIT-NEXT: lxvd2x 1, 3, 8
-; LE-64BIT-NEXT: stxvd2x 1, 5, 8
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: rlwinm 3, 4, 5, 26, 26
+; LE-64BIT-NEXT: ldux 4, 9, 7
+; LE-64BIT-NEXT: ld 7, 8(9)
+; LE-64BIT-NEXT: subfic 6, 3, 64
+; LE-64BIT-NEXT: ld 8, 16(9)
+; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 4, 4, 3
+; LE-64BIT-NEXT: sld 10, 7, 6
+; LE-64BIT-NEXT: sld 11, 9, 6
+; LE-64BIT-NEXT: srd 7, 7, 3
+; LE-64BIT-NEXT: sld 6, 8, 6
+; LE-64BIT-NEXT: or 4, 10, 4
+; LE-64BIT-NEXT: srd 10, 8, 3
+; LE-64BIT-NEXT: srad 3, 9, 3
+; LE-64BIT-NEXT: or 6, 6, 7
+; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 11, 10
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 4, 0(5)
+; LE-64BIT-NEXT: std 3, 16(5)
+; LE-64BIT-NEXT: blr
+;
+; BE-LABEL: ashr_32bytes_wordOff:
+; BE: # %bb.0:
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
+; BE-NEXT: ld 3, 24(3)
+; BE-NEXT: lwz 4, 28(4)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 2, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
+; BE-NEXT: std 3, -64(1)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: extsw 3, 3
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: rlwinm 4, 4, 5, 26, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
+; BE-NEXT: srad 3, 3, 4
+; BE-NEXT: std 3, 0(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
+; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
+; BE-NEXT: blr
+;
+; LE-32BIT-LABEL: ashr_32bytes_wordOff:
+; LE-32BIT: # %bb.0:
+; LE-32BIT-NEXT: stwu 1, -80(1)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: addi 6, 1, 48
+; LE-32BIT-NEXT: lwz 8, 4(3)
+; LE-32BIT-NEXT: lwz 9, 8(3)
+; LE-32BIT-NEXT: lwz 10, 12(3)
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: lwz 12, 20(3)
+; LE-32BIT-NEXT: lwz 0, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: lwz 4, 28(4)
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: srawi 3, 7, 31
+; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: stw 3, 40(1)
+; LE-32BIT-NEXT: stw 3, 36(1)
+; LE-32BIT-NEXT: stw 3, 32(1)
+; LE-32BIT-NEXT: stw 3, 28(1)
+; LE-32BIT-NEXT: stw 3, 24(1)
+; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
+; LE-32BIT-NEXT: sub 3, 6, 4
+; LE-32BIT-NEXT: lwz 4, 4(3)
+; LE-32BIT-NEXT: lwz 6, 0(3)
+; LE-32BIT-NEXT: lwz 7, 12(3)
+; LE-32BIT-NEXT: lwz 8, 8(3)
+; LE-32BIT-NEXT: lwz 9, 20(3)
+; LE-32BIT-NEXT: lwz 10, 16(3)
+; LE-32BIT-NEXT: lwz 11, 24(3)
+; LE-32BIT-NEXT: lwz 3, 28(3)
+; LE-32BIT-NEXT: stw 11, 24(5)
+; LE-32BIT-NEXT: stw 3, 28(5)
+; LE-32BIT-NEXT: stw 10, 16(5)
+; LE-32BIT-NEXT: stw 9, 20(5)
+; LE-32BIT-NEXT: stw 8, 8(5)
+; LE-32BIT-NEXT: stw 7, 12(5)
+; LE-32BIT-NEXT: stw 6, 0(5)
+; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: addi 1, 1, 80
+; LE-32BIT-NEXT: blr
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; LE-64BIT-LABEL: ashr_32bytes_dwordOff:
+; LE-64BIT: # %bb.0:
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
+; LE-64BIT-NEXT: ld 6, 16(3)
+; LE-64BIT-NEXT: ld 7, 24(3)
+; LE-64BIT-NEXT: lwz 3, 0(4)
+; LE-64BIT-NEXT: addi 4, 1, -64
+; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28
+; LE-64BIT-NEXT: stxvd2x 0, 0, 4
+; LE-64BIT-NEXT: std 6, -48(1)
+; LE-64BIT-NEXT: sradi 6, 7, 63
+; LE-64BIT-NEXT: std 7, -40(1)
+; LE-64BIT-NEXT: std 6, -8(1)
+; LE-64BIT-NEXT: std 6, -16(1)
+; LE-64BIT-NEXT: std 6, -24(1)
+; LE-64BIT-NEXT: std 6, -32(1)
+; LE-64BIT-NEXT: lxvd2x 0, 4, 3
+; LE-64BIT-NEXT: add 3, 4, 3
+; LE-64BIT-NEXT: li 4, 16
+; LE-64BIT-NEXT: lxvd2x 1, 3, 4
+; LE-64BIT-NEXT: stxvd2x 1, 5, 4
; LE-64BIT-NEXT: stxvd2x 0, 0, 5
; LE-64BIT-NEXT: blr
;
-; BE-LABEL: ashr_32bytes:
+; BE-LABEL: ashr_32bytes_dwordOff:
; BE: # %bb.0:
; BE-NEXT: ld 7, 0(3)
; BE-NEXT: ld 8, 8(3)
; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 6, 1, -64
-; BE-NEXT: std 3, 56(6)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
; BE-NEXT: sradi 3, 7, 63
-; BE-NEXT: clrlwi 4, 4, 27
-; BE-NEXT: std 3, 24(6)
-; BE-NEXT: std 3, 16(6)
-; BE-NEXT: std 3, 8(6)
+; BE-NEXT: rlwinm 4, 4, 3, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
; BE-NEXT: neg 3, 4
-; BE-NEXT: std 9, 48(6)
-; BE-NEXT: std 8, 40(6)
-; BE-NEXT: std 7, 32(6)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
+; BE-NEXT: std 7, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: addi 4, 1, -32
-; BE-NEXT: ldux 3, 4, 3
-; BE-NEXT: ld 6, 8(4)
-; BE-NEXT: ld 7, 24(4)
-; BE-NEXT: ld 4, 16(4)
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: ld 4, 8(6)
+; BE-NEXT: ld 7, 24(6)
+; BE-NEXT: ld 6, 16(6)
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 4, 16(5)
+; BE-NEXT: std 6, 16(5)
; BE-NEXT: std 7, 24(5)
-; BE-NEXT: std 6, 8(5)
+; BE-NEXT: std 4, 8(5)
; BE-NEXT: blr
;
-; LE-32BIT-LABEL: ashr_32bytes:
+; LE-32BIT-LABEL: ashr_32bytes_dwordOff:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -80(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
@@ -707,7 +1915,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 4, 28(4)
; LE-32BIT-NEXT: stw 3, 76(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28
; LE-32BIT-NEXT: stw 0, 72(1)
; LE-32BIT-NEXT: stw 12, 68(1)
; LE-32BIT-NEXT: stw 11, 64(1)
@@ -743,11 +1951,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: addi 1, 1, 80
; LE-32BIT-NEXT: blr
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; LE: {{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
index 044ddf562294..8e69547df6fc 100644
--- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll
@@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
; LE-32BIT-NEXT: li 6, 0
-; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
+; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
; LE-32BIT-NEXT: stw 6, 16(1)
-; LE-32BIT-NEXT: addi 6, 1, 32
-; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 44(1)
-; LE-32BIT-NEXT: sub 6, 6, 7
+; LE-32BIT-NEXT: addi 3, 1, 32
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: li 3, 7
+; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: nand 3, 4, 3
-; LE-32BIT-NEXT: lwz 7, 4(6)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 9, 0(6)
-; LE-32BIT-NEXT: clrlwi 3, 3, 27
-; LE-32BIT-NEXT: lwz 6, 12(6)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: srw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: slw 3, 7, 3
-; LE-32BIT-NEXT: or 6, 8, 6
-; LE-32BIT-NEXT: or 4, 10, 4
-; LE-32BIT-NEXT: or 3, 11, 3
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 6, 12(5)
-; LE-32BIT-NEXT: stw 4, 4(5)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 7, 32(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: or 3, 6, 3
; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: stw 3, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
+; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
; LE-32BIT-NEXT: stw 6, 32(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 31
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: addi 3, 1, 16
; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: subfic 8, 4, 32
; LE-32BIT-NEXT: stw 7, 16(1)
-; LE-32BIT-NEXT: li 7, 7
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: nand 7, 4, 7
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 10, 4, 32
-; LE-32BIT-NEXT: lwz 8, 8(6)
-; LE-32BIT-NEXT: clrlwi 7, 7, 27
; LE-32BIT-NEXT: lwz 9, 4(6)
; LE-32BIT-NEXT: slw 3, 3, 4
+; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: lwz 6, 12(6)
; LE-32BIT-NEXT: slw 11, 9, 4
-; LE-32BIT-NEXT: srw 9, 9, 10
-; LE-32BIT-NEXT: srw 10, 6, 10
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 8, 4
-; LE-32BIT-NEXT: srwi 8, 8, 1
-; LE-32BIT-NEXT: srw 7, 8, 7
+; LE-32BIT-NEXT: srw 9, 9, 8
+; LE-32BIT-NEXT: srw 10, 7, 8
+; LE-32BIT-NEXT: srw 8, 6, 8
+; LE-32BIT-NEXT: slw 7, 7, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 10
+; LE-32BIT-NEXT: stw 4, 12(5)
+; LE-32BIT-NEXT: or 4, 7, 8
; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 11, 7
-; LE-32BIT-NEXT: stw 6, 12(5)
+; LE-32BIT-NEXT: or 3, 11, 10
; LE-32BIT-NEXT: stw 4, 8(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: addi 1, 1, 48
@@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -48(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: li 6, 7
+; LE-32BIT-NEXT: addi 6, 1, 32
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 3, 12(3)
; LE-32BIT-NEXT: lwz 4, 12(4)
; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 8, 36(1)
-; LE-32BIT-NEXT: rlwinm 8, 4, 29, 28, 31
; LE-32BIT-NEXT: stw 7, 32(1)
-; LE-32BIT-NEXT: addi 7, 1, 32
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 29
; LE-32BIT-NEXT: stw 9, 40(1)
-; LE-32BIT-NEXT: nand 6, 4, 6
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 8, 36(1)
+; LE-32BIT-NEXT: subfic 9, 4, 32
; LE-32BIT-NEXT: stw 3, 28(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
; LE-32BIT-NEXT: stw 3, 24(1)
-; LE-32BIT-NEXT: subfic 10, 4, 32
; LE-32BIT-NEXT: stw 3, 20(1)
-; LE-32BIT-NEXT: clrlwi 6, 6, 27
; LE-32BIT-NEXT: stw 3, 16(1)
-; LE-32BIT-NEXT: sub 3, 7, 8
-; LE-32BIT-NEXT: lwz 7, 4(3)
-; LE-32BIT-NEXT: lwz 8, 8(3)
-; LE-32BIT-NEXT: lwz 9, 0(3)
-; LE-32BIT-NEXT: lwz 3, 12(3)
-; LE-32BIT-NEXT: srw 11, 8, 4
-; LE-32BIT-NEXT: slw 8, 8, 10
-; LE-32BIT-NEXT: slw 10, 9, 10
+; LE-32BIT-NEXT: sub 3, 6, 7
+; LE-32BIT-NEXT: lwz 6, 4(3)
+; LE-32BIT-NEXT: lwz 7, 0(3)
+; LE-32BIT-NEXT: lwz 8, 12(3)
+; LE-32BIT-NEXT: srw 10, 6, 4
+; LE-32BIT-NEXT: lwz 3, 8(3)
+; LE-32BIT-NEXT: slw 11, 7, 9
+; LE-32BIT-NEXT: slw 6, 6, 9
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 9, 3, 9
; LE-32BIT-NEXT: srw 3, 3, 4
-; LE-32BIT-NEXT: sraw 9, 9, 4
-; LE-32BIT-NEXT: srw 4, 7, 4
-; LE-32BIT-NEXT: slwi 7, 7, 1
-; LE-32BIT-NEXT: or 3, 8, 3
-; LE-32BIT-NEXT: slw 6, 7, 6
+; LE-32BIT-NEXT: or 3, 6, 3
+; LE-32BIT-NEXT: stw 3, 8(5)
+; LE-32BIT-NEXT: or 3, 9, 8
+; LE-32BIT-NEXT: sraw 4, 7, 4
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 10, 4
+; LE-32BIT-NEXT: or 3, 11, 10
+; LE-32BIT-NEXT: stw 4, 0(5)
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 11, 6
-; LE-32BIT-NEXT: stw 9, 0(5)
-; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: addi 1, 1, 48
; LE-32BIT-NEXT: blr
%src = load i128, ptr %src.ptr, align 1
@@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: li 4, 48
; LE-64BIT-NEXT: stxvd2x 2, 7, 4
; LE-64BIT-NEXT: stxvd2x 2, 7, 8
-; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: stxvd2x 1, 0, 7
-; LE-64BIT-NEXT: li 6, 7
-; LE-64BIT-NEXT: ldux 7, 4, 7
-; LE-64BIT-NEXT: ld 8, 16(4)
-; LE-64BIT-NEXT: nand 6, 3, 6
+; LE-64BIT-NEXT: xori 8, 3, 63
+; LE-64BIT-NEXT: ldux 6, 4, 7
+; LE-64BIT-NEXT: ld 7, 16(4)
; LE-64BIT-NEXT: ld 9, 8(4)
-; LE-64BIT-NEXT: clrlwi 3, 3, 29
; LE-64BIT-NEXT: ld 4, 24(4)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
+; LE-64BIT-NEXT: srd 6, 6, 3
+; LE-64BIT-NEXT: sldi 11, 7, 1
+; LE-64BIT-NEXT: srd 10, 9, 3
; LE-64BIT-NEXT: srd 7, 7, 3
-; LE-64BIT-NEXT: sldi 10, 8, 1
-; LE-64BIT-NEXT: srd 11, 9, 3
-; LE-64BIT-NEXT: srd 8, 8, 3
-; LE-64BIT-NEXT: sld 6, 10, 6
+; LE-64BIT-NEXT: sld 8, 11, 8
+; LE-64BIT-NEXT: or 8, 10, 8
; LE-64BIT-NEXT: subfic 10, 3, 64
; LE-64BIT-NEXT: srd 3, 4, 3
-; LE-64BIT-NEXT: or 6, 11, 6
; LE-64BIT-NEXT: sld 11, 4, 10
; LE-64BIT-NEXT: sld 9, 9, 10
; LE-64BIT-NEXT: std 3, 24(5)
-; LE-64BIT-NEXT: or 7, 9, 7
-; LE-64BIT-NEXT: or 3, 11, 8
-; LE-64BIT-NEXT: std 6, 8(5)
-; LE-64BIT-NEXT: std 7, 0(5)
+; LE-64BIT-NEXT: std 8, 8(5)
+; LE-64BIT-NEXT: or 6, 9, 6
+; LE-64BIT-NEXT: or 3, 11, 7
+; LE-64BIT-NEXT: std 6, 0(5)
; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
@@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: addi 11, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -32
+; BE-NEXT: std 9, -40(1)
+; BE-NEXT: std 9, -48(1)
+; BE-NEXT: std 9, -56(1)
+; BE-NEXT: std 9, -64(1)
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
; BE-NEXT: neg 3, 3
-; BE-NEXT: std 10, 24(9)
-; BE-NEXT: std 10, 16(9)
-; BE-NEXT: std 10, 8(9)
-; BE-NEXT: std 10, -64(1)
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
-; BE-NEXT: std 6, 32(9)
+; BE-NEXT: std 8, -16(1)
+; BE-NEXT: std 7, -24(1)
+; BE-NEXT: std 6, -32(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 11, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(11)
-; BE-NEXT: ld 8, 16(11)
-; BE-NEXT: ld 9, 24(11)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 10, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 6, 8(10)
+; BE-NEXT: ld 7, 24(10)
+; BE-NEXT: ld 8, 16(10)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srd 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 6, 4
+; BE-NEXT: srd 7, 7, 4
+; BE-NEXT: sld 6, 6, 9
+; BE-NEXT: sld 9, 8, 9
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 7, 9, 7
+; BE-NEXT: or 6, 6, 8
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 7, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: lshr_32bytes:
@@ -538,7 +519,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 0, 24(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 6, 48(1)
; LE-32BIT-NEXT: stw 6, 44(1)
; LE-32BIT-NEXT: stw 6, 40(1)
; LE-32BIT-NEXT: stw 6, 36(1)
@@ -546,68 +526,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 28(1)
; LE-32BIT-NEXT: stw 6, 24(1)
; LE-32BIT-NEXT: stw 6, 20(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT: stw 3, 80(1)
-; LE-32BIT-NEXT: addi 3, 1, 52
+; LE-32BIT-NEXT: stw 6, 16(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT: stw 3, 76(1)
+; LE-32BIT-NEXT: addi 3, 1, 48
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: sub 3, 3, 6
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
-; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
-; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: lwz 6, 4(3)
; LE-32BIT-NEXT: subfic 30, 4, 32
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
; LE-32BIT-NEXT: srw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: srw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-NEXT: lxvd2x 0, 3, 6
; LE-64BIT-NEXT: stxvd2x 2, 7, 6
; LE-64BIT-NEXT: li 6, 48
-; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 31
+; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: neg 3, 3
; LE-64BIT-NEXT: stxvd2x 0, 7, 6
; LE-64BIT-NEXT: li 6, 32
; LE-64BIT-NEXT: extsw 3, 3
; LE-64BIT-NEXT: stxvd2x 1, 7, 6
; LE-64BIT-NEXT: stxvd2x 2, 0, 7
-; LE-64BIT-NEXT: li 6, 7
+; LE-64BIT-NEXT: subfic 6, 4, 64
; LE-64BIT-NEXT: ldux 3, 8, 3
-; LE-64BIT-NEXT: ld 7, 8(8)
-; LE-64BIT-NEXT: nand 6, 4, 6
-; LE-64BIT-NEXT: ld 9, 16(8)
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ld 8, 24(8)
-; LE-64BIT-NEXT: clrlwi 6, 6, 26
-; LE-64BIT-NEXT: rldicl 10, 7, 63, 1
-; LE-64BIT-NEXT: sld 8, 8, 4
+; LE-64BIT-NEXT: ld 7, 16(8)
+; LE-64BIT-NEXT: ld 9, 24(8)
+; LE-64BIT-NEXT: ld 8, 8(8)
+; LE-64BIT-NEXT: srd 10, 7, 6
+; LE-64BIT-NEXT: sld 9, 9, 4
; LE-64BIT-NEXT: sld 7, 7, 4
-; LE-64BIT-NEXT: srd 6, 10, 6
-; LE-64BIT-NEXT: sld 10, 9, 4
-; LE-64BIT-NEXT: or 6, 10, 6
-; LE-64BIT-NEXT: subfic 10, 4, 64
-; LE-64BIT-NEXT: srd 9, 9, 10
-; LE-64BIT-NEXT: srd 10, 3, 10
+; LE-64BIT-NEXT: or 9, 9, 10
+; LE-64BIT-NEXT: srd 10, 8, 6
+; LE-64BIT-NEXT: srd 6, 3, 6
+; LE-64BIT-NEXT: sld 8, 8, 4
; LE-64BIT-NEXT: sld 3, 3, 4
-; LE-64BIT-NEXT: std 6, 16(5)
-; LE-64BIT-NEXT: or 7, 7, 10
+; LE-64BIT-NEXT: or 6, 8, 6
; LE-64BIT-NEXT: std 3, 0(5)
-; LE-64BIT-NEXT: or 3, 8, 9
-; LE-64BIT-NEXT: std 7, 8(5)
-; LE-64BIT-NEXT: std 3, 24(5)
+; LE-64BIT-NEXT: or 3, 7, 10
+; LE-64BIT-NEXT: std 9, 24(5)
+; LE-64BIT-NEXT: std 6, 8(5)
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: shl_32bytes:
@@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; BE-NEXT: ld 8, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: li 10, 0
-; BE-NEXT: std 10, 56(9)
-; BE-NEXT: std 10, 48(9)
-; BE-NEXT: std 10, 40(9)
-; BE-NEXT: std 10, 32(9)
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 8, 16(9)
-; BE-NEXT: std 7, 8(9)
+; BE-NEXT: li 9, 0
+; BE-NEXT: addi 10, 1, -64
+; BE-NEXT: std 9, -8(1)
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 9, -24(1)
+; BE-NEXT: std 9, -32(1)
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 8, -48(1)
+; BE-NEXT: std 7, -56(1)
; BE-NEXT: std 6, -64(1)
-; BE-NEXT: rlwinm 3, 4, 29, 27, 31
-; BE-NEXT: ldux 6, 3, 9
-; BE-NEXT: li 7, 7
-; BE-NEXT: nand 7, 4, 7
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 7, 7, 26
-; BE-NEXT: ld 8, 16(3)
-; BE-NEXT: ld 9, 8(3)
+; BE-NEXT: rlwinm 3, 4, 29, 27, 28
+; BE-NEXT: ldux 6, 3, 10
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 16(3)
+; BE-NEXT: ld 8, 8(3)
; BE-NEXT: ld 3, 24(3)
-; BE-NEXT: subfic 10, 4, 64
; BE-NEXT: sld 6, 6, 4
-; BE-NEXT: rldicl 11, 8, 63, 1
-; BE-NEXT: sld 8, 8, 4
-; BE-NEXT: srd 7, 11, 7
-; BE-NEXT: srd 11, 9, 10
-; BE-NEXT: sld 9, 9, 4
-; BE-NEXT: srd 10, 3, 10
+; BE-NEXT: srd 10, 7, 9
+; BE-NEXT: sld 11, 8, 4
+; BE-NEXT: srd 8, 8, 9
+; BE-NEXT: srd 9, 3, 9
+; BE-NEXT: sld 7, 7, 4
; BE-NEXT: sld 3, 3, 4
-; BE-NEXT: or 6, 6, 11
-; BE-NEXT: or 7, 9, 7
-; BE-NEXT: or 8, 8, 10
+; BE-NEXT: or 10, 11, 10
+; BE-NEXT: or 6, 6, 8
+; BE-NEXT: or 7, 7, 9
; BE-NEXT: std 3, 24(5)
-; BE-NEXT: std 8, 16(5)
+; BE-NEXT: std 7, 16(5)
; BE-NEXT: std 6, 0(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: shl_32bytes:
@@ -731,7 +700,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 6, 80(1)
; LE-32BIT-NEXT: stw 6, 76(1)
; LE-32BIT-NEXT: stw 6, 72(1)
; LE-32BIT-NEXT: stw 6, 68(1)
@@ -739,61 +707,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: stw 6, 60(1)
; LE-32BIT-NEXT: stw 6, 56(1)
; LE-32BIT-NEXT: stw 6, 52(1)
-; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31
-; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: addi 3, 1, 20
-; LE-32BIT-NEXT: stw 0, 44(1)
-; LE-32BIT-NEXT: stw 12, 40(1)
-; LE-32BIT-NEXT: stw 11, 36(1)
-; LE-32BIT-NEXT: stw 10, 32(1)
-; LE-32BIT-NEXT: stw 9, 28(1)
-; LE-32BIT-NEXT: stw 8, 24(1)
-; LE-32BIT-NEXT: li 8, 7
-; LE-32BIT-NEXT: stw 7, 20(1)
-; LE-32BIT-NEXT: nand 8, 4, 8
+; LE-32BIT-NEXT: stw 6, 48(1)
+; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29
+; LE-32BIT-NEXT: stw 3, 44(1)
+; LE-32BIT-NEXT: addi 3, 1, 16
+; LE-32BIT-NEXT: stw 0, 40(1)
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
+; LE-32BIT-NEXT: stw 12, 36(1)
+; LE-32BIT-NEXT: subfic 12, 4, 32
+; LE-32BIT-NEXT: stw 11, 32(1)
+; LE-32BIT-NEXT: stw 10, 28(1)
+; LE-32BIT-NEXT: stw 9, 24(1)
+; LE-32BIT-NEXT: stw 8, 20(1)
+; LE-32BIT-NEXT: stw 7, 16(1)
; LE-32BIT-NEXT: lwzux 3, 6, 3
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: subfic 0, 4, 32
-; LE-32BIT-NEXT: clrlwi 8, 8, 27
; LE-32BIT-NEXT: lwz 7, 8(6)
; LE-32BIT-NEXT: slw 3, 3, 4
-; LE-32BIT-NEXT: lwz 9, 4(6)
-; LE-32BIT-NEXT: lwz 10, 16(6)
-; LE-32BIT-NEXT: srwi 29, 7, 1
-; LE-32BIT-NEXT: lwz 11, 12(6)
-; LE-32BIT-NEXT: slw 28, 9, 4
-; LE-32BIT-NEXT: lwz 12, 24(6)
-; LE-32BIT-NEXT: srwi 27, 10, 1
-; LE-32BIT-NEXT: lwz 30, 20(6)
-; LE-32BIT-NEXT: slw 26, 11, 4
+; LE-32BIT-NEXT: lwz 8, 4(6)
+; LE-32BIT-NEXT: lwz 9, 16(6)
+; LE-32BIT-NEXT: srw 30, 7, 12
+; LE-32BIT-NEXT: lwz 10, 12(6)
+; LE-32BIT-NEXT: slw 29, 8, 4
+; LE-32BIT-NEXT: lwz 11, 24(6)
+; LE-32BIT-NEXT: srw 8, 8, 12
+; LE-32BIT-NEXT: lwz 0, 20(6)
+; LE-32BIT-NEXT: srw 28, 9, 12
; LE-32BIT-NEXT: lwz 6, 28(6)
-; LE-32BIT-NEXT: srw 9, 9, 0
-; LE-32BIT-NEXT: slw 25, 30, 4
-; LE-32BIT-NEXT: srw 11, 11, 0
+; LE-32BIT-NEXT: slw 27, 10, 4
+; LE-32BIT-NEXT: srw 10, 10, 12
; LE-32BIT-NEXT: slw 7, 7, 4
-; LE-32BIT-NEXT: srw 30, 30, 0
-; LE-32BIT-NEXT: slw 10, 10, 4
-; LE-32BIT-NEXT: srw 0, 6, 0
-; LE-32BIT-NEXT: slw 6, 6, 4
-; LE-32BIT-NEXT: slw 4, 12, 4
-; LE-32BIT-NEXT: srwi 12, 12, 1
-; LE-32BIT-NEXT: srw 29, 29, 8
-; LE-32BIT-NEXT: srw 27, 27, 8
-; LE-32BIT-NEXT: srw 8, 12, 8
-; LE-32BIT-NEXT: or 3, 3, 9
-; LE-32BIT-NEXT: or 4, 4, 0
-; LE-32BIT-NEXT: stw 3, 0(5)
-; LE-32BIT-NEXT: or 3, 25, 8
+; LE-32BIT-NEXT: srw 26, 11, 12
+; LE-32BIT-NEXT: slw 25, 0, 4
+; LE-32BIT-NEXT: srw 0, 0, 12
+; LE-32BIT-NEXT: slw 9, 9, 4
+; LE-32BIT-NEXT: srw 12, 6, 12
+; LE-32BIT-NEXT: slw 11, 11, 4
+; LE-32BIT-NEXT: slw 4, 6, 4
+; LE-32BIT-NEXT: stw 4, 28(5)
+; LE-32BIT-NEXT: or 4, 11, 12
; LE-32BIT-NEXT: stw 4, 24(5)
-; LE-32BIT-NEXT: or 4, 10, 30
-; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 26, 27
+; LE-32BIT-NEXT: or 4, 9, 0
; LE-32BIT-NEXT: stw 4, 16(5)
-; LE-32BIT-NEXT: or 4, 7, 11
-; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 6, 28(5)
+; LE-32BIT-NEXT: or 4, 25, 26
+; LE-32BIT-NEXT: stw 4, 20(5)
+; LE-32BIT-NEXT: or 4, 7, 10
+; LE-32BIT-NEXT: or 3, 3, 8
; LE-32BIT-NEXT: stw 4, 8(5)
+; LE-32BIT-NEXT: or 4, 27, 28
+; LE-32BIT-NEXT: stw 3, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 30
+; LE-32BIT-NEXT: stw 4, 12(5)
; LE-32BIT-NEXT: stw 3, 4(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
@@ -812,98 +775,91 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-64BIT-LABEL: ashr_32bytes:
; LE-64BIT: # %bb.0:
-; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: ld 6, 24(3)
+; LE-64BIT-NEXT: lxvd2x 0, 0, 3
; LE-64BIT-NEXT: lwz 4, 0(4)
; LE-64BIT-NEXT: addi 7, 1, -64
; LE-64BIT-NEXT: ld 3, 16(3)
; LE-64BIT-NEXT: sradi 8, 6, 63
-; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 31
-; LE-64BIT-NEXT: std 6, 24(7)
-; LE-64BIT-NEXT: std 3, 16(7)
-; LE-64BIT-NEXT: li 3, 7
-; LE-64BIT-NEXT: std 8, 56(7)
-; LE-64BIT-NEXT: std 8, 48(7)
-; LE-64BIT-NEXT: std 8, 40(7)
-; LE-64BIT-NEXT: std 8, 32(7)
+; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 28
+; LE-64BIT-NEXT: clrlwi 4, 4, 26
; LE-64BIT-NEXT: stxvd2x 0, 0, 7
-; LE-64BIT-NEXT: nand 3, 4, 3
-; LE-64BIT-NEXT: clrlwi 4, 4, 29
-; LE-64BIT-NEXT: ldux 6, 9, 7
-; LE-64BIT-NEXT: ld 7, 16(9)
+; LE-64BIT-NEXT: std 6, -40(1)
+; LE-64BIT-NEXT: std 3, -48(1)
+; LE-64BIT-NEXT: std 8, -8(1)
+; LE-64BIT-NEXT: std 8, -16(1)
+; LE-64BIT-NEXT: std 8, -24(1)
+; LE-64BIT-NEXT: std 8, -32(1)
+; LE-64BIT-NEXT: ldux 3, 9, 7
+; LE-64BIT-NEXT: xori 7, 4, 63
+; LE-64BIT-NEXT: ld 6, 16(9)
; LE-64BIT-NEXT: ld 8, 8(9)
-; LE-64BIT-NEXT: clrlwi 3, 3, 26
; LE-64BIT-NEXT: ld 9, 24(9)
+; LE-64BIT-NEXT: srd 3, 3, 4
+; LE-64BIT-NEXT: sldi 11, 6, 1
+; LE-64BIT-NEXT: srd 10, 8, 4
; LE-64BIT-NEXT: srd 6, 6, 4
-; LE-64BIT-NEXT: sldi 10, 7, 1
-; LE-64BIT-NEXT: srd 11, 8, 4
-; LE-64BIT-NEXT: srd 7, 7, 4
-; LE-64BIT-NEXT: sld 3, 10, 3
+; LE-64BIT-NEXT: sld 7, 11, 7
+; LE-64BIT-NEXT: or 7, 10, 7
; LE-64BIT-NEXT: subfic 10, 4, 64
; LE-64BIT-NEXT: srad 4, 9, 4
-; LE-64BIT-NEXT: or 3, 11, 3
-; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: sld 8, 8, 10
+; LE-64BIT-NEXT: sld 11, 9, 10
; LE-64BIT-NEXT: std 4, 24(5)
-; LE-64BIT-NEXT: or 6, 8, 6
-; LE-64BIT-NEXT: or 4, 11, 7
-; LE-64BIT-NEXT: std 3, 8(5)
-; LE-64BIT-NEXT: std 6, 0(5)
-; LE-64BIT-NEXT: std 4, 16(5)
+; LE-64BIT-NEXT: std 7, 8(5)
+; LE-64BIT-NEXT: or 3, 8, 3
+; LE-64BIT-NEXT: std 3, 0(5)
+; LE-64BIT-NEXT: or 3, 11, 6
+; LE-64BIT-NEXT: std 3, 16(5)
; LE-64BIT-NEXT: blr
;
; BE-LABEL: ashr_32bytes:
; BE: # %bb.0:
-; BE-NEXT: ld 6, 0(3)
-; BE-NEXT: ld 7, 8(3)
-; BE-NEXT: ld 8, 16(3)
+; BE-NEXT: ld 7, 0(3)
+; BE-NEXT: ld 8, 8(3)
+; BE-NEXT: ld 9, 16(3)
; BE-NEXT: ld 3, 24(3)
; BE-NEXT: lwz 4, 28(4)
-; BE-NEXT: addi 9, 1, -64
-; BE-NEXT: addi 10, 1, -32
-; BE-NEXT: std 3, 56(9)
-; BE-NEXT: std 6, 32(9)
-; BE-NEXT: sradi 3, 6, 63
-; BE-NEXT: rlwinm 6, 4, 29, 27, 31
-; BE-NEXT: std 3, 24(9)
-; BE-NEXT: std 3, 16(9)
-; BE-NEXT: std 3, 8(9)
+; BE-NEXT: addi 6, 1, -32
+; BE-NEXT: std 3, -8(1)
+; BE-NEXT: std 7, -32(1)
+; BE-NEXT: sradi 3, 7, 63
+; BE-NEXT: rlwinm 7, 4, 29, 27, 28
+; BE-NEXT: std 3, -40(1)
+; BE-NEXT: std 3, -48(1)
+; BE-NEXT: std 3, -56(1)
; BE-NEXT: std 3, -64(1)
-; BE-NEXT: neg 3, 6
-; BE-NEXT: std 8, 48(9)
-; BE-NEXT: std 7, 40(9)
+; BE-NEXT: neg 3, 7
+; BE-NEXT: std 9, -16(1)
+; BE-NEXT: std 8, -24(1)
; BE-NEXT: extsw 3, 3
-; BE-NEXT: ldux 3, 10, 3
-; BE-NEXT: li 6, 7
-; BE-NEXT: nand 6, 4, 6
-; BE-NEXT: clrlwi 4, 4, 29
-; BE-NEXT: clrlwi 6, 6, 26
-; BE-NEXT: ld 7, 8(10)
-; BE-NEXT: ld 8, 16(10)
-; BE-NEXT: ld 9, 24(10)
-; BE-NEXT: subfic 10, 4, 64
-; BE-NEXT: sldi 11, 7, 1
-; BE-NEXT: srd 7, 7, 4
-; BE-NEXT: srd 9, 9, 4
-; BE-NEXT: sld 6, 11, 6
-; BE-NEXT: sld 11, 3, 10
-; BE-NEXT: sld 10, 8, 10
-; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: ldux 3, 6, 3
+; BE-NEXT: clrlwi 4, 4, 26
+; BE-NEXT: subfic 9, 4, 64
+; BE-NEXT: ld 7, 8(6)
+; BE-NEXT: ld 8, 24(6)
+; BE-NEXT: ld 6, 16(6)
+; BE-NEXT: sld 10, 3, 9
; BE-NEXT: srad 3, 3, 4
-; BE-NEXT: or 7, 11, 7
-; BE-NEXT: or 6, 8, 6
-; BE-NEXT: or 8, 10, 9
; BE-NEXT: std 3, 0(5)
-; BE-NEXT: std 8, 24(5)
-; BE-NEXT: std 7, 8(5)
+; BE-NEXT: srd 11, 7, 4
+; BE-NEXT: srd 8, 8, 4
+; BE-NEXT: sld 7, 7, 9
+; BE-NEXT: sld 9, 6, 9
+; BE-NEXT: srd 6, 6, 4
+; BE-NEXT: or 10, 10, 11
+; BE-NEXT: or 8, 9, 8
+; BE-NEXT: or 6, 7, 6
; BE-NEXT: std 6, 16(5)
+; BE-NEXT: std 8, 24(5)
+; BE-NEXT: std 10, 8(5)
; BE-NEXT: blr
;
; LE-32BIT-LABEL: ashr_32bytes:
; LE-32BIT: # %bb.0:
; LE-32BIT-NEXT: stwu 1, -112(1)
; LE-32BIT-NEXT: lwz 7, 0(3)
-; LE-32BIT-NEXT: addi 6, 1, 52
+; LE-32BIT-NEXT: addi 6, 1, 48
; LE-32BIT-NEXT: lwz 8, 4(3)
; LE-32BIT-NEXT: lwz 9, 8(3)
; LE-32BIT-NEXT: lwz 10, 12(3)
@@ -912,76 +868,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; LE-32BIT-NEXT: lwz 0, 24(3)
; LE-32BIT-NEXT: lwz 3, 28(3)
; LE-32BIT-NEXT: lwz 4, 28(4)
-; LE-32BIT-NEXT: stw 3, 80(1)
+; LE-32BIT-NEXT: stw 3, 76(1)
; LE-32BIT-NEXT: srawi 3, 7, 31
-; LE-32BIT-NEXT: stw 7, 52(1)
-; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 31
+; LE-32BIT-NEXT: stw 7, 48(1)
+; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 29
; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill
+; LE-32BIT-NEXT: clrlwi 4, 4, 27
; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill
; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill
-; LE-32BIT-NEXT: stw 0, 76(1)
-; LE-32BIT-NEXT: stw 12, 72(1)
-; LE-32BIT-NEXT: stw 11, 68(1)
-; LE-32BIT-NEXT: stw 10, 64(1)
-; LE-32BIT-NEXT: stw 9, 60(1)
-; LE-32BIT-NEXT: li 9, 7
-; LE-32BIT-NEXT: stw 8, 56(1)
-; LE-32BIT-NEXT: nand 9, 4, 9
-; LE-32BIT-NEXT: stw 3, 48(1)
-; LE-32BIT-NEXT: clrlwi 4, 4, 29
-; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: subfic 30, 4, 32
+; LE-32BIT-NEXT: stw 0, 72(1)
+; LE-32BIT-NEXT: stw 12, 68(1)
+; LE-32BIT-NEXT: xori 12, 4, 31
+; LE-32BIT-NEXT: stw 11, 64(1)
+; LE-32BIT-NEXT: stw 10, 60(1)
+; LE-32BIT-NEXT: stw 9, 56(1)
+; LE-32BIT-NEXT: stw 8, 52(1)
+; LE-32BIT-NEXT: stw 3, 44(1)
; LE-32BIT-NEXT: stw 3, 40(1)
-; LE-32BIT-NEXT: clrlwi 9, 9, 27
; LE-32BIT-NEXT: stw 3, 36(1)
; LE-32BIT-NEXT: stw 3, 32(1)
; LE-32BIT-NEXT: stw 3, 28(1)
; LE-32BIT-NEXT: stw 3, 24(1)
; LE-32BIT-NEXT: stw 3, 20(1)
+; LE-32BIT-NEXT: stw 3, 16(1)
; LE-32BIT-NEXT: sub 3, 6, 7
-; LE-32BIT-NEXT: lwz 6, 4(3)
-; LE-32BIT-NEXT: lwz 7, 8(3)
-; LE-32BIT-NEXT: lwz 8, 12(3)
-; LE-32BIT-NEXT: slwi 29, 6, 1
-; LE-32BIT-NEXT: lwz 10, 16(3)
-; LE-32BIT-NEXT: srw 28, 7, 4
-; LE-32BIT-NEXT: lwz 11, 20(3)
-; LE-32BIT-NEXT: slwi 27, 8, 1
-; LE-32BIT-NEXT: lwz 12, 24(3)
+; LE-32BIT-NEXT: lwz 6, 8(3)
+; LE-32BIT-NEXT: lwz 7, 4(3)
+; LE-32BIT-NEXT: lwz 8, 0(3)
+; LE-32BIT-NEXT: srw 29, 6, 4
+; LE-32BIT-NEXT: lwz 9, 12(3)
+; LE-32BIT-NEXT: slw 6, 6, 30
+; LE-32BIT-NEXT: lwz 10, 20(3)
+; LE-32BIT-NEXT: slw 28, 8, 30
+; LE-32BIT-NEXT: lwz 11, 16(3)
+; LE-32BIT-NEXT: srw 27, 9, 4
+; LE-32BIT-NEXT: lwz 0, 28(3)
; LE-32BIT-NEXT: srw 26, 10, 4
-; LE-32BIT-NEXT: lwz 0, 0(3)
-; LE-32BIT-NEXT: srw 6, 6, 4
-; LE-32BIT-NEXT: lwz 3, 28(3)
-; LE-32BIT-NEXT: srw 25, 12, 4
-; LE-32BIT-NEXT: slw 12, 12, 30
-; LE-32BIT-NEXT: slw 7, 7, 30
-; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: lwz 3, 24(3)
+; LE-32BIT-NEXT: slw 25, 11, 30
+; LE-32BIT-NEXT: slw 9, 9, 30
; LE-32BIT-NEXT: slw 10, 10, 30
-; LE-32BIT-NEXT: slw 30, 0, 30
-; LE-32BIT-NEXT: srw 8, 8, 4
-; LE-32BIT-NEXT: sraw 0, 0, 4
-; LE-32BIT-NEXT: srw 4, 11, 4
-; LE-32BIT-NEXT: or 3, 12, 3
+; LE-32BIT-NEXT: slw 30, 3, 30
+; LE-32BIT-NEXT: srw 3, 3, 4
+; LE-32BIT-NEXT: srw 0, 0, 4
+; LE-32BIT-NEXT: or 3, 10, 3
+; LE-32BIT-NEXT: srw 11, 11, 4
+; LE-32BIT-NEXT: stw 3, 24(5)
+; LE-32BIT-NEXT: or 3, 30, 0
; LE-32BIT-NEXT: stw 3, 28(5)
-; LE-32BIT-NEXT: or 3, 10, 4
-; LE-32BIT-NEXT: slwi 11, 11, 1
+; LE-32BIT-NEXT: or 3, 9, 11
+; LE-32BIT-NEXT: stw 3, 16(5)
+; LE-32BIT-NEXT: or 3, 25, 26
+; LE-32BIT-NEXT: sraw 8, 8, 4
+; LE-32BIT-NEXT: srw 4, 7, 4
+; LE-32BIT-NEXT: slwi 7, 7, 1
; LE-32BIT-NEXT: stw 3, 20(5)
-; LE-32BIT-NEXT: or 3, 7, 8
-; LE-32BIT-NEXT: slw 29, 29, 9
-; LE-32BIT-NEXT: slw 27, 27, 9
-; LE-32BIT-NEXT: slw 9, 11, 9
+; LE-32BIT-NEXT: or 3, 6, 27
+; LE-32BIT-NEXT: slw 7, 7, 12
; LE-32BIT-NEXT: stw 3, 12(5)
-; LE-32BIT-NEXT: or 3, 30, 6
+; LE-32BIT-NEXT: or 3, 28, 4
; LE-32BIT-NEXT: stw 3, 4(5)
-; LE-32BIT-NEXT: or 3, 25, 9
-; LE-32BIT-NEXT: stw 3, 24(5)
-; LE-32BIT-NEXT: or 3, 26, 27
-; LE-32BIT-NEXT: stw 3, 16(5)
-; LE-32BIT-NEXT: or 3, 28, 29
-; LE-32BIT-NEXT: stw 0, 0(5)
+; LE-32BIT-NEXT: or 3, 29, 7
+; LE-32BIT-NEXT: stw 8, 0(5)
; LE-32BIT-NEXT: stw 3, 8(5)
; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload
; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed72..5ba8755201dd 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb a1, 12(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: sb a4, 4(sp)
-; RV32I-NEXT: sb a3, 0(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 14(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 13(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 11(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 10(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 9(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 7(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 6(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 3(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 2(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 1(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a1, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 8(a1)
; RV32I-NEXT: lw a5, 4(a1)
; RV32I-NEXT: lw a1, 0(a1)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: sb a4, 8(sp)
-; RV32I-NEXT: sb a5, 4(sp)
-; RV32I-NEXT: sb a1, 0(sp)
-; RV32I-NEXT: srai a6, a3, 31
-; RV32I-NEXT: sb a6, 28(sp)
-; RV32I-NEXT: sb a6, 24(sp)
-; RV32I-NEXT: sb a6, 20(sp)
-; RV32I-NEXT: sb a6, 16(sp)
-; RV32I-NEXT: srli a7, a3, 24
-; RV32I-NEXT: sb a7, 15(sp)
-; RV32I-NEXT: srli a7, a3, 16
-; RV32I-NEXT: sb a7, 14(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(sp)
-; RV32I-NEXT: srli a3, a4, 24
-; RV32I-NEXT: sb a3, 11(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 10(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 9(sp)
-; RV32I-NEXT: srli a3, a5, 24
-; RV32I-NEXT: sb a3, 7(sp)
-; RV32I-NEXT: srli a3, a5, 16
-; RV32I-NEXT: sb a3, 6(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 5(sp)
-; RV32I-NEXT: srli a3, a1, 24
-; RV32I-NEXT: sb a3, 3(sp)
-; RV32I-NEXT: srli a3, a1, 16
-; RV32I-NEXT: sb a3, 2(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(sp)
-; RV32I-NEXT: srli a1, a6, 24
-; RV32I-NEXT: sb a1, 31(sp)
-; RV32I-NEXT: srli a3, a6, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw a3, 12(sp)
+; RV32I-NEXT: sw a4, 8(sp)
+; RV32I-NEXT: sw a5, 4(sp)
+; RV32I-NEXT: sw a1, 0(sp)
+; RV32I-NEXT: srai a3, a3, 31
+; RV32I-NEXT: sw a3, 28(sp)
+; RV32I-NEXT: sw a3, 24(sp)
+; RV32I-NEXT: sw a3, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: mv a3, sp
; RV32I-NEXT: add a1, a3, a1
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu a4, 0(a1)
-; RV32I-NEXT: lbu a5, 2(a1)
-; RV32I-NEXT: lbu a6, 3(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
+; RV32I-NEXT: lw a3, 0(a1)
+; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: srl a3, a3, a2
-; RV32I-NEXT: lbu a4, 5(a1)
-; RV32I-NEXT: lbu a5, 4(a1)
-; RV32I-NEXT: lbu a6, 6(a1)
-; RV32I-NEXT: lbu a7, 7(a1)
-; RV32I-NEXT: slli a4, a4, 8
-; RV32I-NEXT: or a4, a4, a5
-; RV32I-NEXT: slli a6, a6, 16
-; RV32I-NEXT: slli a7, a7, 24
-; RV32I-NEXT: or a5, a7, a6
-; RV32I-NEXT: or a4, a5, a4
; RV32I-NEXT: slli a5, a4, 1
-; RV32I-NEXT: xori a6, a2, 31
+; RV32I-NEXT: andi a6, a2, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 8(a1)
; RV32I-NEXT: sll a5, a5, a6
; RV32I-NEXT: or a3, a3, a5
; RV32I-NEXT: srl a4, a4, a2
-; RV32I-NEXT: lbu a5, 9(a1)
-; RV32I-NEXT: lbu a7, 8(a1)
-; RV32I-NEXT: lbu t0, 10(a1)
-; RV32I-NEXT: lbu t1, 11(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a7, t1, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: slli a7, a5, 1
-; RV32I-NEXT: not t0, a2
-; RV32I-NEXT: lbu t1, 13(a1)
-; RV32I-NEXT: sll a7, a7, t0
-; RV32I-NEXT: or a4, a4, a7
-; RV32I-NEXT: lbu a7, 12(a1)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: lbu t0, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: or a7, t1, a7
-; RV32I-NEXT: srl a5, a5, a2
-; RV32I-NEXT: slli t0, t0, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: slli a5, a7, 1
+; RV32I-NEXT: lw a1, 12(a1)
+; RV32I-NEXT: sll a5, a5, a6
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: srl a5, a7, a2
; RV32I-NEXT: slli a7, a1, 1
; RV32I-NEXT: sll a6, a7, a6
; RV32I-NEXT: or a5, a5, a6
@@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
; RV32I-NEXT: lw a4, 4(a1)
; RV32I-NEXT: lw a5, 8(a1)
; RV32I-NEXT: lw a1, 12(a1)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb zero, 3(sp)
-; RV32I-NEXT: sb zero, 2(sp)
-; RV32I-NEXT: sb zero, 1(sp)
-; RV32I-NEXT: sb zero, 0(sp)
-; RV32I-NEXT: sb a1, 28(sp)
-; RV32I-NEXT: sb a5, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a3, 16(sp)
-; RV32I-NEXT: srli a6, a1, 24
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: srli a6, a1, 16
-; RV32I-NEXT: sb a6, 30(sp)
-; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 29(sp)
-; RV32I-NEXT: srli a1, a5, 24
-; RV32I-NEXT: sb a1, 27(sp)
-; RV32I-NEXT: srli a1, a5, 16
-; RV32I-NEXT: sb a1, 26(sp)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 25(sp)
-; RV32I-NEXT: srli a1, a4, 24
-; RV32I-NEXT: sb a1, 23(sp)
-; RV32I-NEXT: srli a1, a4, 16
-; RV32I-NEXT: sb a1, 22(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: srli a1, a3, 24
-; RV32I-NEXT: sb a1, 19(sp)
-; RV32I-NEXT: srli a1, a3, 16
-; RV32I-NEXT: sb a1, 18(sp)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 17(sp)
-; RV32I-NEXT: slli a1, a2, 25
-; RV32I-NEXT: srli a1, a1, 28
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a1, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a1, a2, 3
+; RV32I-NEXT: andi a1, a1, 12
; RV32I-NEXT: addi a3, sp, 16
-; RV32I-NEXT: sub a1, a3, a1
-; RV32I-NEXT: lbu a3, 5(a1)
-; RV32I-NEXT: lbu a4, 4(a1)
-; RV32I-NEXT: lbu a5, 6(a1)
-; RV32I-NEXT: lbu a6, 7(a1)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a4
-; RV32I-NEXT: slli a5, a5, 16
-; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a3, a4, a3
-; RV32I-NEXT: andi a2, a2, 7
-; RV32I-NEXT: sll a4, a3, a2
-; RV32I-NEXT: lbu a5, 1(a1)
-; RV32I-NEXT: lbu a6, 0(a1)
-; RV32I-NEXT: lbu a7, 2(a1)
-; RV32I-NEXT: lbu t0, 3(a1)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a6
-; RV32I-NEXT: slli a7, a7, 16
-; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a5, a6, a5
-; RV32I-NEXT: srli a6, a5, 1
-; RV32I-NEXT: xori a7, a2, 31
+; RV32I-NEXT: sub a3, a3, a1
+; RV32I-NEXT: lw a1, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a1, a2
+; RV32I-NEXT: srli a6, a4, 1
+; RV32I-NEXT: andi a7, a2, 31
+; RV32I-NEXT: lw t0, 8(a3)
+; RV32I-NEXT: xori a7, a7, 31
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a4, a4, a6
-; RV32I-NEXT: lbu a6, 9(a1)
-; RV32I-NEXT: lbu t0, 8(a1)
-; RV32I-NEXT: lbu t1, 10(a1)
-; RV32I-NEXT: lbu t2, 11(a1)
-; RV32I-NEXT: slli a6, a6, 8
-; RV32I-NEXT: or a6, a6, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a6, t0, a6
-; RV32I-NEXT: sll t0, a6, a2
-; RV32I-NEXT: srli a3, a3, 1
-; RV32I-NEXT: not t1, a2
-; RV32I-NEXT: srl a3, a3, t1
-; RV32I-NEXT: or a3, t0, a3
-; RV32I-NEXT: lbu t0, 13(a1)
-; RV32I-NEXT: lbu t1, 12(a1)
-; RV32I-NEXT: lbu t2, 14(a1)
-; RV32I-NEXT: lbu a1, 15(a1)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, t2
-; RV32I-NEXT: or a1, a1, t0
-; RV32I-NEXT: sll a1, a1, a2
-; RV32I-NEXT: srli a6, a6, 1
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sll a6, t0, a2
+; RV32I-NEXT: lw a3, 12(a3)
+; RV32I-NEXT: srli a1, a1, 1
+; RV32I-NEXT: srl a1, a1, a7
+; RV32I-NEXT: or a1, a6, a1
+; RV32I-NEXT: sll a3, a3, a2
+; RV32I-NEXT: srli a6, t0, 1
; RV32I-NEXT: srl a6, a6, a7
-; RV32I-NEXT: or a1, a1, a6
-; RV32I-NEXT: sll a2, a5, a2
+; RV32I-NEXT: or a3, a3, a6
+; RV32I-NEXT: sll a2, a4, a2
; RV32I-NEXT: sw a2, 0(a0)
-; RV32I-NEXT: sw a1, 12(a0)
-; RV32I-NEXT: sw a3, 8(a0)
-; RV32I-NEXT: sw a4, 4(a0)
+; RV32I-NEXT: sw a3, 12(a0)
+; RV32I-NEXT: sw a1, 8(a0)
+; RV32I-NEXT: sw a5, 4(a0)
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92..29fe0a7de6b3 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb s2, 18(sp)
-; RV32I-NEXT: sb s1, 17(sp)
-; RV32I-NEXT: sb s0, 16(sp)
-; RV32I-NEXT: sb t6, 15(sp)
-; RV32I-NEXT: sb t5, 14(sp)
-; RV32I-NEXT: sb t4, 13(sp)
-; RV32I-NEXT: sb t3, 12(sp)
-; RV32I-NEXT: sb t2, 11(sp)
-; RV32I-NEXT: sb t1, 10(sp)
-; RV32I-NEXT: sb t0, 9(sp)
-; RV32I-NEXT: sb a7, 8(sp)
-; RV32I-NEXT: sb a6, 7(sp)
-; RV32I-NEXT: sb a5, 6(sp)
-; RV32I-NEXT: sb a4, 5(sp)
-; RV32I-NEXT: sb a3, 4(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 4
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -823,6 +842,222 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i128 %res, ptr %dst, align 1
ret void
}
+
+define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; i128 lshr by (wordOff * 32) bits; every load/store below is align 1
+; RV64I-LABEL: lshr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 9(a0)
+; RV64I-NEXT: lbu a4, 8(a0)
+; RV64I-NEXT: lbu a5, 10(a0)
+; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 13(a0)
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 1(a1)
+; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a4, a4, 37
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: addi a4, a5, -64
+; RV64I-NEXT: srl a1, a3, a5
+; RV64I-NEXT: bltz a4, .LBB7_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: j .LBB7_3
+; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: lbu a6, 1(a0)
+; RV64I-NEXT: lbu a7, 0(a0)
+; RV64I-NEXT: lbu t0, 2(a0)
+; RV64I-NEXT: lbu t1, 3(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a0)
+; RV64I-NEXT: lbu t0, 4(a0)
+; RV64I-NEXT: lbu t1, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: srl a0, a0, a5
+; RV64I-NEXT: not a5, a5
+; RV64I-NEXT: slli a3, a3, 1
+; RV64I-NEXT: sll a3, a3, a5
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: srai a4, a4, 63
+; RV64I-NEXT: and a1, a4, a1
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 15(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1 ; 16-byte value to shift; align 1, so the checks assemble it from byte loads (lbu)
+ %wordOff = load i128, ptr %wordOff.ptr, align 1 ; shift amount counted in 32-bit words, also loaded with align 1
+ %bitOff = shl i128 %wordOff, 5 ; words -> bits: multiply by 32 (2^5)
+ %res = lshr i128 %src, %bitOff ; logical right shift; the RV32I checks index whole words out of a zero-padded stack buffer
+ store i128 %res, ptr %dst, align 1 ; align 1 store; the checks write the result with one sb per byte
+ ret void
+}
+
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_16bytes:
; RV64I: # %bb.0:
@@ -873,11 +1108,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a4, a1
; RV64I-NEXT: addi a4, a5, -64
; RV64I-NEXT: sll a1, a3, a5
-; RV64I-NEXT: bltz a4, .LBB7_2
+; RV64I-NEXT: bltz a4, .LBB8_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: j .LBB7_3
-; RV64I-NEXT: .LBB7_2:
+; RV64I-NEXT: j .LBB8_3
+; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: lbu a6, 9(a0)
; RV64I-NEXT: lbu a7, 8(a0)
; RV64I-NEXT: lbu t0, 10(a0)
@@ -905,7 +1140,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: srli a3, a3, 1
; RV64I-NEXT: srl a3, a3, a5
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: .LBB7_3:
+; RV64I-NEXT: .LBB8_3:
; RV64I-NEXT: srai a4, a4, 63
; RV64I-NEXT: and a1, a4, a1
; RV64I-NEXT: sb a1, 0(a2)
@@ -942,98 +1177,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb zero, 11(sp)
-; RV32I-NEXT: sb zero, 10(sp)
-; RV32I-NEXT: sb zero, 9(sp)
-; RV32I-NEXT: sb zero, 8(sp)
-; RV32I-NEXT: sb zero, 7(sp)
-; RV32I-NEXT: sb zero, 6(sp)
-; RV32I-NEXT: sb zero, 5(sp)
-; RV32I-NEXT: sb zero, 4(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb s2, 34(sp)
-; RV32I-NEXT: sb s1, 33(sp)
-; RV32I-NEXT: sb s0, 32(sp)
-; RV32I-NEXT: sb t6, 31(sp)
-; RV32I-NEXT: sb t5, 30(sp)
-; RV32I-NEXT: sb t4, 29(sp)
-; RV32I-NEXT: sb t3, 28(sp)
-; RV32I-NEXT: sb t2, 27(sp)
-; RV32I-NEXT: sb t1, 26(sp)
-; RV32I-NEXT: sb t0, 25(sp)
-; RV32I-NEXT: sb a7, 24(sp)
-; RV32I-NEXT: sb a6, 23(sp)
-; RV32I-NEXT: sb a5, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a3, 20(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: addi a0, sp, 20
-; RV32I-NEXT: sub a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a7, a5, a7
+; RV32I-NEXT: sll t0, t0, a1
+; RV32I-NEXT: srli t1, a3, 1
+; RV32I-NEXT: srl t1, t1, a6
+; RV32I-NEXT: or t1, t0, t1
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a3, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, t0, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a5, a5, 24
+; RV32I-NEXT: sb a5, 7(a2)
+; RV32I-NEXT: sb a0, 8(a2)
+; RV32I-NEXT: sb t1, 12(a2)
+; RV32I-NEXT: sb a7, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: srli a0, t1, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, t1, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1042,6 +1296,223 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i128 %res, ptr %dst, align 1
ret void
}
+
+define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { ; i128 shl by (wordOff * 32) bits; every load/store below is align 1
+; RV64I-LABEL: shl_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a1)
+; RV64I-NEXT: lbu a5, 4(a1)
+; RV64I-NEXT: lbu a6, 6(a1)
+; RV64I-NEXT: lbu a7, 7(a1)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 1(a1)
+; RV64I-NEXT: lbu a6, 0(a1)
+; RV64I-NEXT: lbu a7, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: or a1, a1, a5
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a4, a4, 37
+; RV64I-NEXT: or a5, a4, a1
+; RV64I-NEXT: addi a4, a5, -64
+; RV64I-NEXT: sll a1, a3, a5
+; RV64I-NEXT: bltz a4, .LBB9_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: j .LBB9_3
+; RV64I-NEXT: .LBB9_2:
+; RV64I-NEXT: lbu a6, 9(a0)
+; RV64I-NEXT: lbu a7, 8(a0)
+; RV64I-NEXT: lbu t0, 10(a0)
+; RV64I-NEXT: lbu t1, 11(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 13(a0)
+; RV64I-NEXT: lbu t0, 12(a0)
+; RV64I-NEXT: lbu t1, 14(a0)
+; RV64I-NEXT: lbu a0, 15(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: sll a0, a0, a5
+; RV64I-NEXT: not a5, a5
+; RV64I-NEXT: srli a3, a3, 1
+; RV64I-NEXT: srl a3, a3, a5
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB9_3:
+; RV64I-NEXT: srai a4, a4, 63
+; RV64I-NEXT: and a1, a4, a1
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 7(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: addi a0, sp, 16
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1 ; 16-byte value to shift; align 1, so the checks assemble it from byte loads (lbu)
+ %wordOff = load i128, ptr %wordOff.ptr, align 1 ; shift amount counted in 32-bit words, also loaded with align 1
+ %bitOff = shl i128 %wordOff, 5 ; words -> bits: multiply by 32 (2^5)
+ %res = shl i128 %src, %bitOff ; left shift; the RV32I checks index whole words out of a stack buffer zero-padded below the value
+ store i128 %res, ptr %dst, align 1 ; align 1 store; the checks write the result with one sb per byte
+ ret void
+}
+
+
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_16bytes:
; RV64I: # %bb.0:
@@ -1092,13 +1563,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a5, a1
; RV64I-NEXT: addi a6, a5, -64
; RV64I-NEXT: sra a1, a3, a5
-; RV64I-NEXT: bltz a6, .LBB8_2
+; RV64I-NEXT: bltz a6, .LBB10_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: sraiw a3, a4, 31
; RV64I-NEXT: mv a0, a1
; RV64I-NEXT: mv a1, a3
-; RV64I-NEXT: j .LBB8_3
-; RV64I-NEXT: .LBB8_2:
+; RV64I-NEXT: j .LBB10_3
+; RV64I-NEXT: .LBB10_2:
; RV64I-NEXT: lbu a4, 1(a0)
; RV64I-NEXT: lbu a6, 0(a0)
; RV64I-NEXT: lbu a7, 2(a0)
@@ -1126,7 +1597,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: slli a3, a3, 1
; RV64I-NEXT: sll a3, a3, a4
; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: .LBB8_3:
+; RV64I-NEXT: .LBB10_3:
; RV64I-NEXT: sb a1, 8(a2)
; RV64I-NEXT: srli a3, a1, 56
; RV64I-NEXT: sb a3, 15(a2)
@@ -1161,105 +1632,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -48
-; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu a0, 13(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 15(sp)
-; RV32I-NEXT: sb s3, 14(sp)
-; RV32I-NEXT: sb a0, 13(sp)
-; RV32I-NEXT: sb s2, 12(sp)
-; RV32I-NEXT: sb s1, 11(sp)
-; RV32I-NEXT: sb s0, 10(sp)
-; RV32I-NEXT: sb t6, 9(sp)
-; RV32I-NEXT: sb t5, 8(sp)
-; RV32I-NEXT: sb t4, 7(sp)
-; RV32I-NEXT: sb t3, 6(sp)
-; RV32I-NEXT: sb t2, 5(sp)
-; RV32I-NEXT: sb t1, 4(sp)
-; RV32I-NEXT: sb t0, 3(sp)
-; RV32I-NEXT: sb a7, 2(sp)
-; RV32I-NEXT: sb a6, 1(sp)
-; RV32I-NEXT: sb a5, 0(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: sb a4, 20(sp)
-; RV32I-NEXT: sb a4, 16(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: sb a0, 23(sp)
-; RV32I-NEXT: sb a3, 22(sp)
-; RV32I-NEXT: sb a4, 21(sp)
-; RV32I-NEXT: sb a0, 19(sp)
-; RV32I-NEXT: sb a3, 18(sp)
-; RV32I-NEXT: sb a4, 17(sp)
-; RV32I-NEXT: andi a1, a1, 15
-; RV32I-NEXT: mv a0, sp
-; RV32I-NEXT: add a0, a0, a1
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: lbu a4, 7(a0)
-; RV32I-NEXT: lbu a5, 6(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 0(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 2(a0)
-; RV32I-NEXT: lbu t2, 13(a0)
-; RV32I-NEXT: lbu t3, 12(a0)
-; RV32I-NEXT: lbu t4, 15(a0)
-; RV32I-NEXT: lbu t5, 14(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 8(a0)
-; RV32I-NEXT: lbu a0, 9(a0)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb a0, 9(a2)
-; RV32I-NEXT: sb t5, 14(a2)
-; RV32I-NEXT: sb t4, 15(a2)
-; RV32I-NEXT: sb t3, 12(a2)
-; RV32I-NEXT: sb t2, 13(a2)
-; RV32I-NEXT: sb t1, 2(a2)
-; RV32I-NEXT: sb t0, 3(a2)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 24
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or t0, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli t1, a0, 1
+; RV32I-NEXT: sll a6, t1, a6
+; RV32I-NEXT: or a6, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a0, 12(a2)
; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a6, 1(a2)
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: sb a4, 7(a2)
-; RV32I-NEXT: sb a3, 4(a2)
-; RV32I-NEXT: sb a1, 5(a2)
-; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 48
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a6, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a6, 24
+; RV32I-NEXT: sb a0, 11(a2)
+; RV32I-NEXT: srli a0, a6, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, t0, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a0, t0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -1269,441 +1753,645 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
ret void
}
+define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_16bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: lbu a3, 9(a0)
+; RV64I-NEXT: lbu a4, 8(a0)
+; RV64I-NEXT: lbu a5, 10(a0)
+; RV64I-NEXT: lbu a6, 11(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 13(a0)
+; RV64I-NEXT: lbu a5, 12(a0)
+; RV64I-NEXT: lbu a6, 14(a0)
+; RV64I-NEXT: lbu a7, 15(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a5, a4, 32
+; RV64I-NEXT: or a3, a5, a3
+; RV64I-NEXT: lbu a5, 5(a1)
+; RV64I-NEXT: lbu a6, 4(a1)
+; RV64I-NEXT: lbu a7, 6(a1)
+; RV64I-NEXT: lbu t0, 7(a1)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu a1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: slli a1, a1, 5
+; RV64I-NEXT: slli a5, a5, 37
+; RV64I-NEXT: or a5, a5, a1
+; RV64I-NEXT: addi a6, a5, -64
+; RV64I-NEXT: sra a1, a3, a5
+; RV64I-NEXT: bltz a6, .LBB11_2
+; RV64I-NEXT: # %bb.1:
+; RV64I-NEXT: sraiw a3, a4, 31
+; RV64I-NEXT: mv a0, a1
+; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: j .LBB11_3
+; RV64I-NEXT: .LBB11_2:
+; RV64I-NEXT: lbu a4, 1(a0)
+; RV64I-NEXT: lbu a6, 0(a0)
+; RV64I-NEXT: lbu a7, 2(a0)
+; RV64I-NEXT: lbu t0, 3(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a4, a6, a4
+; RV64I-NEXT: lbu a6, 5(a0)
+; RV64I-NEXT: lbu a7, 4(a0)
+; RV64I-NEXT: lbu t0, 6(a0)
+; RV64I-NEXT: lbu a0, 7(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a4
+; RV64I-NEXT: srl a0, a0, a5
+; RV64I-NEXT: not a4, a5
+; RV64I-NEXT: slli a3, a3, 1
+; RV64I-NEXT: sll a3, a3, a4
+; RV64I-NEXT: or a0, a0, a3
+; RV64I-NEXT: .LBB11_3:
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 15(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 14(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 13(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: sb a0, 0(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 1(a2)
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_16bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 12
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: lw a1, 8(a0)
+; RV32I-NEXT: lw a3, 12(a0)
+; RV32I-NEXT: lw a4, 0(a0)
+; RV32I-NEXT: lw a0, 4(a0)
+; RV32I-NEXT: sb a1, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a4, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a1, 16
+; RV32I-NEXT: sb a5, 10(a2)
+; RV32I-NEXT: srli a5, a1, 24
+; RV32I-NEXT: sb a5, 11(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 9(a2)
+; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 15(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a1, a4, 16
+; RV32I-NEXT: sb a1, 2(a2)
+; RV32I-NEXT: srli a1, a4, 24
+; RV32I-NEXT: sb a1, 3(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 1(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
+; RV32I-NEXT: ret
+ %src = load i128, ptr %src.ptr, align 1
+ %wordOff = load i128, ptr %wordOff.ptr, align 1
+ %bitOff = shl i128 %wordOff, 5
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 87(sp)
-; RV64I-NEXT: sb a4, 86(sp)
-; RV64I-NEXT: sb a0, 85(sp)
-; RV64I-NEXT: sb a5, 84(sp)
-; RV64I-NEXT: sb a6, 83(sp)
-; RV64I-NEXT: sb a7, 82(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t0, 81(sp)
-; RV64I-NEXT: sb ra, 80(sp)
-; RV64I-NEXT: sb s11, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 56
-; RV64I-NEXT: add a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: srl a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 59(sp)
-; RV32I-NEXT: sb a4, 58(sp)
-; RV32I-NEXT: sb a0, 57(sp)
-; RV32I-NEXT: sb a5, 56(sp)
-; RV32I-NEXT: sb a6, 55(sp)
-; RV32I-NEXT: sb a7, 54(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb t0, 53(sp)
-; RV32I-NEXT: sb ra, 52(sp)
-; RV32I-NEXT: sb s11, 51(sp)
-; RV32I-NEXT: sb s10, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 28
-; RV32I-NEXT: add a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: srl a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -1712,441 +2400,1167 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i256 %res, ptr %dst, align 1
ret void
}
-define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: shl_32bytes:
+
+define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_wordOff:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: lbu s11, 23(a0)
-; RV64I-NEXT: lbu ra, 24(a0)
-; RV64I-NEXT: lbu t0, 25(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu a6, 27(a0)
-; RV64I-NEXT: lbu a5, 28(a0)
-; RV64I-NEXT: lbu a3, 31(a0)
-; RV64I-NEXT: lbu a4, 30(a0)
-; RV64I-NEXT: lbu a0, 29(a0)
-; RV64I-NEXT: lbu a1, 0(a1)
-; RV64I-NEXT: sb a3, 119(sp)
-; RV64I-NEXT: sb a4, 118(sp)
-; RV64I-NEXT: sb a0, 117(sp)
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t0, 113(sp)
-; RV64I-NEXT: sb ra, 112(sp)
-; RV64I-NEXT: sb s11, 111(sp)
-; RV64I-NEXT: sb s10, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: andi a1, a1, 31
-; RV64I-NEXT: addi a0, sp, 88
-; RV64I-NEXT: sub a6, a0, a1
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: srl a1, a4, a5
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a5, 32
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srl t0, t0, a5
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a5
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: srl a3, a3, a5
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a5, a6, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a6, 16
; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64I-NEXT: srli a5, a6, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 31(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 30(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 29(a2)
+; RV64I-NEXT: srli a5, a3, 32
+; RV64I-NEXT: sb a5, 28(a2)
+; RV64I-NEXT: srli a5, a3, 24
+; RV64I-NEXT: sb a5, 27(a2)
+; RV64I-NEXT: srli a5, a3, 16
+; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a3, t0, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, t0, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a3, t0, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a4, a4, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: lshr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: mv a0, sp
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: lshr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a6, a4, a1
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: srli a0, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a0, a0, a7
+; RV64I-NEXT: or a0, a6, a0
+; RV64I-NEXT: sll t0, t0, a1
+; RV64I-NEXT: srli t1, a3, 1
+; RV64I-NEXT: srl t1, t1, a7
+; RV64I-NEXT: or t1, t0, t1
+; RV64I-NEXT: sll a3, a3, a1
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a7
+; RV64I-NEXT: or a4, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
+; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a3, a3, 56
+; RV64I-NEXT: sb a3, 23(a2)
+; RV64I-NEXT: srli a3, t0, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 7(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 6(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 5(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 4(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: srli a1, a6, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb t1, 24(a2)
; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a1, t1, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, t1, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, t1, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, t1, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, t1, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, t1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s10, 22(a0)
-; RV32I-NEXT: lbu s11, 23(a0)
-; RV32I-NEXT: lbu ra, 24(a0)
-; RV32I-NEXT: lbu t0, 25(a0)
-; RV32I-NEXT: lbu a7, 26(a0)
-; RV32I-NEXT: lbu a6, 27(a0)
-; RV32I-NEXT: lbu a5, 28(a0)
-; RV32I-NEXT: lbu a3, 31(a0)
-; RV32I-NEXT: lbu a4, 30(a0)
-; RV32I-NEXT: lbu a0, 29(a0)
-; RV32I-NEXT: lbu a1, 0(a1)
-; RV32I-NEXT: sb a3, 91(sp)
-; RV32I-NEXT: sb a4, 90(sp)
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a5, 88(sp)
-; RV32I-NEXT: sb a6, 87(sp)
-; RV32I-NEXT: sb a7, 86(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb t0, 85(sp)
-; RV32I-NEXT: sb ra, 84(sp)
-; RV32I-NEXT: sb s11, 83(sp)
-; RV32I-NEXT: sb s10, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: andi a1, a1, 31
-; RV32I-NEXT: addi a0, sp, 60
-; RV32I-NEXT: sub a6, a0, a1
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu t2, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: addi a3, sp, 32
+; RV32I-NEXT: sub a6, a3, a0
+; RV32I-NEXT: lw a3, 4(a6)
+; RV32I-NEXT: slli a7, a1, 3
+; RV32I-NEXT: lw t0, 0(a6)
+; RV32I-NEXT: sll a4, a3, a7
+; RV32I-NEXT: andi a0, a7, 24
+; RV32I-NEXT: xori t1, a0, 31
+; RV32I-NEXT: srli a0, t0, 1
+; RV32I-NEXT: lw t2, 12(a6)
+; RV32I-NEXT: lw a5, 8(a6)
+; RV32I-NEXT: srl a0, a0, t1
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: sll t3, t2, a7
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: srl a1, a1, t1
+; RV32I-NEXT: or a1, t3, a1
+; RV32I-NEXT: sll t4, a5, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t5, 20(a6)
+; RV32I-NEXT: lw t6, 16(a6)
+; RV32I-NEXT: srl a3, a3, t1
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: sll s0, t5, a7
+; RV32I-NEXT: srli a5, t6, 1
+; RV32I-NEXT: srl a5, a5, t1
+; RV32I-NEXT: or a5, s0, a5
+; RV32I-NEXT: sll t6, t6, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: lw s1, 28(a6)
+; RV32I-NEXT: lw a6, 24(a6)
+; RV32I-NEXT: srl t2, t2, t1
+; RV32I-NEXT: or t2, t6, t2
+; RV32I-NEXT: sll s1, s1, a7
+; RV32I-NEXT: srli s2, a6, 1
+; RV32I-NEXT: srl s2, s2, t1
+; RV32I-NEXT: or s2, s1, s2
+; RV32I-NEXT: sll a6, a6, a7
+; RV32I-NEXT: srli t5, t5, 1
+; RV32I-NEXT: srl t1, t5, t1
+; RV32I-NEXT: or t1, a6, t1
+; RV32I-NEXT: sll a7, t0, a7
+; RV32I-NEXT: sb a7, 0(a2)
+; RV32I-NEXT: srli a6, a6, 24
+; RV32I-NEXT: sb a6, 27(a2)
+; RV32I-NEXT: srli s1, s1, 24
+; RV32I-NEXT: sb s1, 31(a2)
+; RV32I-NEXT: srli a6, t6, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli s0, s0, 24
+; RV32I-NEXT: sb s0, 23(a2)
+; RV32I-NEXT: srli a6, t4, 24
+; RV32I-NEXT: sb a6, 11(a2)
+; RV32I-NEXT: srli a6, t3, 24
+; RV32I-NEXT: sb a6, 15(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 1(a2)
+; RV32I-NEXT: srli a4, a4, 24
+; RV32I-NEXT: sb a4, 7(a2)
+; RV32I-NEXT: sb t1, 24(a2)
+; RV32I-NEXT: sb s2, 28(a2)
+; RV32I-NEXT: sb t2, 16(a2)
+; RV32I-NEXT: sb a5, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, s2, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a4, s2, 8
; RV32I-NEXT: sb a4, 29(a2)
-; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
-; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 21(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2155,457 +3569,1169 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i256 %res, ptr %dst, align 1
ret void
}
-define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; RV64I-LABEL: ashr_32bytes:
+
+define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_wordOff:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv t0, a1
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a1, 0(a0)
-; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 1(a0)
-; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 2(a0)
-; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 3(a0)
-; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 4(a0)
-; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a1, 5(a0)
-; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t2, 6(a0)
-; RV64I-NEXT: lbu t3, 7(a0)
-; RV64I-NEXT: lbu t4, 8(a0)
-; RV64I-NEXT: lbu t5, 9(a0)
-; RV64I-NEXT: lbu t6, 10(a0)
-; RV64I-NEXT: lbu s0, 11(a0)
-; RV64I-NEXT: lbu s1, 12(a0)
-; RV64I-NEXT: lbu s2, 13(a0)
-; RV64I-NEXT: lbu s3, 14(a0)
-; RV64I-NEXT: lbu s4, 15(a0)
-; RV64I-NEXT: lbu s5, 16(a0)
-; RV64I-NEXT: lbu s6, 17(a0)
-; RV64I-NEXT: lbu s7, 18(a0)
-; RV64I-NEXT: lbu s8, 19(a0)
-; RV64I-NEXT: lbu s9, 20(a0)
-; RV64I-NEXT: lbu s10, 21(a0)
-; RV64I-NEXT: lbu s11, 22(a0)
-; RV64I-NEXT: lbu ra, 23(a0)
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a0, a3, a0
+; RV64I-NEXT: ld a4, 8(a0)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: ld a6, 0(a0)
+; RV64I-NEXT: sll a3, a4, a5
+; RV64I-NEXT: andi a1, a5, 32
+; RV64I-NEXT: xori a7, a1, 63
+; RV64I-NEXT: srli a1, a6, 1
+; RV64I-NEXT: ld t0, 24(a0)
+; RV64I-NEXT: ld t1, 16(a0)
+; RV64I-NEXT: srl a0, a1, a7
+; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: sll t0, t0, a5
+; RV64I-NEXT: srli a1, t1, 1
+; RV64I-NEXT: srl a1, a1, a7
+; RV64I-NEXT: or a1, t0, a1
+; RV64I-NEXT: sll t1, t1, a5
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a7
+; RV64I-NEXT: or a4, t1, a4
+; RV64I-NEXT: sll a5, a6, a5
+; RV64I-NEXT: sb a5, 0(a2)
+; RV64I-NEXT: srli a6, t1, 56
+; RV64I-NEXT: sb a6, 23(a2)
+; RV64I-NEXT: srli a6, t1, 48
+; RV64I-NEXT: sb a6, 22(a2)
+; RV64I-NEXT: srli a6, t1, 40
+; RV64I-NEXT: sb a6, 21(a2)
+; RV64I-NEXT: srli a6, t1, 32
+; RV64I-NEXT: sb a6, 20(a2)
+; RV64I-NEXT: srli a6, t0, 56
+; RV64I-NEXT: sb a6, 31(a2)
+; RV64I-NEXT: srli a6, t0, 48
+; RV64I-NEXT: sb a6, 30(a2)
+; RV64I-NEXT: srli a6, t0, 40
+; RV64I-NEXT: sb a6, 29(a2)
+; RV64I-NEXT: srli a6, t0, 32
+; RV64I-NEXT: sb a6, 28(a2)
+; RV64I-NEXT: srli a6, a5, 56
+; RV64I-NEXT: sb a6, 7(a2)
+; RV64I-NEXT: srli a6, a5, 48
+; RV64I-NEXT: sb a6, 6(a2)
+; RV64I-NEXT: srli a6, a5, 40
+; RV64I-NEXT: sb a6, 5(a2)
+; RV64I-NEXT: srli a6, a5, 32
+; RV64I-NEXT: sb a6, 4(a2)
+; RV64I-NEXT: srli a6, a5, 24
+; RV64I-NEXT: sb a6, 3(a2)
+; RV64I-NEXT: srli a6, a5, 16
+; RV64I-NEXT: sb a6, 2(a2)
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: sb a5, 1(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 15(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 14(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 13(a2)
+; RV64I-NEXT: srli a3, a3, 32
+; RV64I-NEXT: sb a3, 12(a2)
+; RV64I-NEXT: sb a4, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a3, a4, 24
+; RV64I-NEXT: sb a3, 19(a2)
+; RV64I-NEXT: srli a3, a4, 16
+; RV64I-NEXT: sb a3, 18(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 17(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: shl_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: addi a0, sp, 32
+; RV32I-NEXT: sub a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: shl_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a4, 27(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: lbu t0, 0(t0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a4, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb ra, 79(sp)
-; RV64I-NEXT: sb s11, 78(sp)
-; RV64I-NEXT: sb s10, 77(sp)
-; RV64I-NEXT: sb s9, 76(sp)
-; RV64I-NEXT: sb s8, 75(sp)
-; RV64I-NEXT: sb s7, 74(sp)
-; RV64I-NEXT: sb s6, 73(sp)
-; RV64I-NEXT: sb s5, 72(sp)
-; RV64I-NEXT: sb s4, 71(sp)
-; RV64I-NEXT: sb s3, 70(sp)
-; RV64I-NEXT: sb s2, 69(sp)
-; RV64I-NEXT: sb s1, 68(sp)
-; RV64I-NEXT: sb s0, 67(sp)
-; RV64I-NEXT: sb t6, 66(sp)
-; RV64I-NEXT: sb t5, 65(sp)
-; RV64I-NEXT: sb t4, 64(sp)
-; RV64I-NEXT: sb t3, 63(sp)
-; RV64I-NEXT: sb t2, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: addi a0, sp, 32
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: andi a0, t0, 31
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a6, a1, a0
-; RV64I-NEXT: lbu a0, 8(a6)
-; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 9(a6)
-; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 10(a6)
-; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 11(a6)
-; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a0, 12(a6)
-; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a7, 13(a6)
-; RV64I-NEXT: lbu t0, 14(a6)
-; RV64I-NEXT: lbu t1, 15(a6)
-; RV64I-NEXT: lbu t2, 0(a6)
-; RV64I-NEXT: lbu t3, 1(a6)
-; RV64I-NEXT: lbu t4, 2(a6)
-; RV64I-NEXT: lbu t5, 3(a6)
-; RV64I-NEXT: lbu t6, 4(a6)
-; RV64I-NEXT: lbu s0, 5(a6)
-; RV64I-NEXT: lbu s1, 6(a6)
-; RV64I-NEXT: lbu s2, 7(a6)
-; RV64I-NEXT: lbu s3, 24(a6)
-; RV64I-NEXT: lbu s4, 25(a6)
-; RV64I-NEXT: lbu s5, 26(a6)
-; RV64I-NEXT: lbu s6, 27(a6)
-; RV64I-NEXT: lbu s7, 28(a6)
-; RV64I-NEXT: lbu s8, 29(a6)
-; RV64I-NEXT: lbu s9, 30(a6)
-; RV64I-NEXT: lbu s10, 31(a6)
-; RV64I-NEXT: lbu s11, 16(a6)
-; RV64I-NEXT: lbu ra, 17(a6)
-; RV64I-NEXT: lbu a5, 18(a6)
-; RV64I-NEXT: lbu a4, 19(a6)
-; RV64I-NEXT: lbu a0, 23(a6)
-; RV64I-NEXT: lbu a1, 22(a6)
-; RV64I-NEXT: lbu a3, 21(a6)
-; RV64I-NEXT: lbu a6, 20(a6)
-; RV64I-NEXT: sb a0, 23(a2)
-; RV64I-NEXT: sb a1, 22(a2)
-; RV64I-NEXT: sb a3, 21(a2)
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: sb a5, 18(a2)
-; RV64I-NEXT: sb ra, 17(a2)
-; RV64I-NEXT: sb s11, 16(a2)
-; RV64I-NEXT: sb s10, 31(a2)
-; RV64I-NEXT: sb s9, 30(a2)
-; RV64I-NEXT: sb s8, 29(a2)
-; RV64I-NEXT: sb s7, 28(a2)
-; RV64I-NEXT: sb s6, 27(a2)
-; RV64I-NEXT: sb s5, 26(a2)
-; RV64I-NEXT: sb s4, 25(a2)
-; RV64I-NEXT: sb s3, 24(a2)
-; RV64I-NEXT: sb s2, 7(a2)
-; RV64I-NEXT: sb s1, 6(a2)
-; RV64I-NEXT: sb s0, 5(a2)
-; RV64I-NEXT: sb t6, 4(a2)
-; RV64I-NEXT: sb t5, 3(a2)
-; RV64I-NEXT: sb t4, 2(a2)
-; RV64I-NEXT: sb t3, 1(a2)
-; RV64I-NEXT: sb t2, 0(a2)
-; RV64I-NEXT: sb t1, 15(a2)
-; RV64I-NEXT: sb t0, 14(a2)
-; RV64I-NEXT: sb a7, 13(a2)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 12(a2)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 11(a2)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 10(a2)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: sb a0, 9(a2)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
-; RV32I-LABEL: ashr_32bytes:
+; RV32I-LABEL: shl_32bytes_dwordOff:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv t0, a1
-; RV32I-NEXT: lbu t1, 31(a0)
-; RV32I-NEXT: lbu a1, 0(a0)
-; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 1(a0)
-; RV32I-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 2(a0)
-; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 3(a0)
-; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 4(a0)
-; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a1, 5(a0)
-; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t3, 7(a0)
-; RV32I-NEXT: lbu t4, 8(a0)
-; RV32I-NEXT: lbu t5, 9(a0)
-; RV32I-NEXT: lbu t6, 10(a0)
-; RV32I-NEXT: lbu s0, 11(a0)
-; RV32I-NEXT: lbu s1, 12(a0)
-; RV32I-NEXT: lbu s2, 13(a0)
-; RV32I-NEXT: lbu s3, 14(a0)
-; RV32I-NEXT: lbu s4, 15(a0)
-; RV32I-NEXT: lbu s5, 16(a0)
-; RV32I-NEXT: lbu s6, 17(a0)
-; RV32I-NEXT: lbu s7, 18(a0)
-; RV32I-NEXT: lbu s8, 19(a0)
-; RV32I-NEXT: lbu s9, 20(a0)
-; RV32I-NEXT: lbu s10, 21(a0)
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a4, 27(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: lbu t0, 0(t0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a4, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb t1, 59(sp)
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s10, 49(sp)
-; RV32I-NEXT: sb s9, 48(sp)
-; RV32I-NEXT: sb s8, 47(sp)
-; RV32I-NEXT: sb s7, 46(sp)
-; RV32I-NEXT: sb s6, 45(sp)
-; RV32I-NEXT: sb s5, 44(sp)
-; RV32I-NEXT: sb s4, 43(sp)
-; RV32I-NEXT: sb s3, 42(sp)
-; RV32I-NEXT: sb s2, 41(sp)
-; RV32I-NEXT: sb s1, 40(sp)
-; RV32I-NEXT: sb s0, 39(sp)
-; RV32I-NEXT: sb t6, 38(sp)
-; RV32I-NEXT: sb t5, 37(sp)
-; RV32I-NEXT: sb t4, 36(sp)
-; RV32I-NEXT: sb t3, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t1, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t1, 56(sp)
+; RV32I-NEXT: sw t0, 52(sp)
+; RV32I-NEXT: sw a7, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: addi a0, sp, 32
+; RV32I-NEXT: sub a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: andi a0, t0, 31
-; RV32I-NEXT: addi a1, sp, 28
-; RV32I-NEXT: add a6, a1, a0
-; RV32I-NEXT: lbu a0, 6(a6)
-; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 7(a6)
-; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 4(a6)
-; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 5(a6)
-; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a0, 0(a6)
-; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a7, 1(a6)
-; RV32I-NEXT: lbu t0, 2(a6)
-; RV32I-NEXT: lbu t1, 3(a6)
-; RV32I-NEXT: lbu t2, 14(a6)
-; RV32I-NEXT: lbu t3, 15(a6)
-; RV32I-NEXT: lbu t4, 12(a6)
-; RV32I-NEXT: lbu t5, 13(a6)
-; RV32I-NEXT: lbu t6, 10(a6)
-; RV32I-NEXT: lbu s0, 11(a6)
-; RV32I-NEXT: lbu s1, 8(a6)
-; RV32I-NEXT: lbu s2, 9(a6)
-; RV32I-NEXT: lbu s3, 22(a6)
-; RV32I-NEXT: lbu s4, 23(a6)
-; RV32I-NEXT: lbu s5, 20(a6)
-; RV32I-NEXT: lbu s6, 21(a6)
-; RV32I-NEXT: lbu s7, 18(a6)
-; RV32I-NEXT: lbu s8, 19(a6)
-; RV32I-NEXT: lbu s9, 16(a6)
-; RV32I-NEXT: lbu s10, 17(a6)
-; RV32I-NEXT: lbu s11, 30(a6)
-; RV32I-NEXT: lbu ra, 31(a6)
-; RV32I-NEXT: lbu a5, 28(a6)
-; RV32I-NEXT: lbu a4, 29(a6)
-; RV32I-NEXT: lbu a0, 25(a6)
-; RV32I-NEXT: lbu a1, 24(a6)
-; RV32I-NEXT: lbu a3, 27(a6)
-; RV32I-NEXT: lbu a6, 26(a6)
-; RV32I-NEXT: sb a0, 25(a2)
-; RV32I-NEXT: sb a1, 24(a2)
-; RV32I-NEXT: sb a3, 27(a2)
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: andi a0, a1, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: srl a5, a4, a1
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a1, 56
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a5, a0
+; RV64I-NEXT: srl t0, t0, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a1
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a5, 8(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_32bytes:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu t3, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli t3, t3, 8
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a1, a1, t3
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: andi a0, a1, 28
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a5, a3, a0
+; RV32I-NEXT: lw a3, 4(a5)
+; RV32I-NEXT: slli a6, a1, 3
+; RV32I-NEXT: srl a4, a3, a6
+; RV32I-NEXT: lw a7, 8(a5)
+; RV32I-NEXT: andi a0, a6, 24
+; RV32I-NEXT: xori t0, a0, 31
+; RV32I-NEXT: lw a1, 0(a5)
+; RV32I-NEXT: slli a0, a7, 1
+; RV32I-NEXT: sll a0, a0, t0
+; RV32I-NEXT: or a0, a4, a0
+; RV32I-NEXT: srl t1, a1, a6
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw t2, 12(a5)
+; RV32I-NEXT: lw t3, 16(a5)
+; RV32I-NEXT: sll a1, a3, t0
+; RV32I-NEXT: or a1, t1, a1
+; RV32I-NEXT: srl t4, t2, a6
+; RV32I-NEXT: slli a3, t3, 1
+; RV32I-NEXT: sll a3, a3, t0
+; RV32I-NEXT: or a3, t4, a3
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw t5, 20(a5)
+; RV32I-NEXT: lw t6, 24(a5)
+; RV32I-NEXT: sll t2, t2, t0
+; RV32I-NEXT: or t2, a7, t2
+; RV32I-NEXT: srl s0, t5, a6
+; RV32I-NEXT: slli s1, t6, 1
+; RV32I-NEXT: sll s1, s1, t0
+; RV32I-NEXT: or s1, s0, s1
+; RV32I-NEXT: srl t3, t3, a6
+; RV32I-NEXT: slli t5, t5, 1
+; RV32I-NEXT: lw a5, 28(a5)
+; RV32I-NEXT: sll t5, t5, t0
+; RV32I-NEXT: or t5, t3, t5
+; RV32I-NEXT: srl t6, t6, a6
+; RV32I-NEXT: slli s2, a5, 1
+; RV32I-NEXT: sll t0, s2, t0
+; RV32I-NEXT: or t0, t6, t0
+; RV32I-NEXT: sra a5, a5, a6
+; RV32I-NEXT: sb t6, 24(a2)
; RV32I-NEXT: sb a5, 28(a2)
-; RV32I-NEXT: sb ra, 31(a2)
-; RV32I-NEXT: sb s11, 30(a2)
-; RV32I-NEXT: sb s10, 17(a2)
-; RV32I-NEXT: sb s9, 16(a2)
-; RV32I-NEXT: sb s8, 19(a2)
-; RV32I-NEXT: sb s7, 18(a2)
-; RV32I-NEXT: sb s6, 21(a2)
-; RV32I-NEXT: sb s5, 20(a2)
-; RV32I-NEXT: sb s4, 23(a2)
-; RV32I-NEXT: sb s3, 22(a2)
-; RV32I-NEXT: sb s2, 9(a2)
-; RV32I-NEXT: sb s1, 8(a2)
-; RV32I-NEXT: sb s0, 11(a2)
-; RV32I-NEXT: sb t6, 10(a2)
-; RV32I-NEXT: sb t5, 13(a2)
+; RV32I-NEXT: sb t3, 16(a2)
+; RV32I-NEXT: sb s0, 20(a2)
+; RV32I-NEXT: sb a7, 8(a2)
; RV32I-NEXT: sb t4, 12(a2)
-; RV32I-NEXT: sb t3, 15(a2)
-; RV32I-NEXT: sb t2, 14(a2)
-; RV32I-NEXT: sb t1, 3(a2)
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a7, 1(a2)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 0(a2)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: sb t1, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 31(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 30(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 27(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t5, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t5, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t5, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, s1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, s1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli s1, s1, 8
+; RV32I-NEXT: sb s1, 21(a2)
+; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, t2, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a4, t2, 8
+; RV32I-NEXT: sb a4, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 6(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 80
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2614,3 +4740,744 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
store i256 %res, ptr %dst, align 1
ret void
}
+
+define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_wordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli t2, t2, 24
+; RV64I-NEXT: or t0, t2, t1
+; RV64I-NEXT: or a7, t0, a7
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli t0, t0, 8
+; RV64I-NEXT: or t0, t0, t1
+; RV64I-NEXT: slli t2, t2, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t2
+; RV64I-NEXT: or a1, a1, t0
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a0, a1, 2
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: slli a5, a1, 5
+; RV64I-NEXT: srl a1, a4, a5
+; RV64I-NEXT: ld a6, 16(a3)
+; RV64I-NEXT: andi a0, a5, 32
+; RV64I-NEXT: xori a7, a0, 63
+; RV64I-NEXT: ld t0, 0(a3)
+; RV64I-NEXT: slli a0, a6, 1
+; RV64I-NEXT: sll a0, a0, a7
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srl t0, t0, a5
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a7
+; RV64I-NEXT: or a4, t0, a4
+; RV64I-NEXT: srl a6, a6, a5
+; RV64I-NEXT: slli t1, a3, 1
+; RV64I-NEXT: sll a7, t1, a7
+; RV64I-NEXT: or a7, a6, a7
+; RV64I-NEXT: sra a3, a3, a5
+; RV64I-NEXT: sb a6, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb t0, 0(a2)
+; RV64I-NEXT: sb a1, 8(a2)
+; RV64I-NEXT: srli a5, a6, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a6, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a5, a6, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a5, a3, 56
+; RV64I-NEXT: sb a5, 31(a2)
+; RV64I-NEXT: srli a5, a3, 48
+; RV64I-NEXT: sb a5, 30(a2)
+; RV64I-NEXT: srli a5, a3, 40
+; RV64I-NEXT: sb a5, 29(a2)
+; RV64I-NEXT: srli a5, a3, 32
+; RV64I-NEXT: sb a5, 28(a2)
+; RV64I-NEXT: srli a5, a3, 24
+; RV64I-NEXT: sb a5, 27(a2)
+; RV64I-NEXT: srli a5, a3, 16
+; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a3, t0, 24
+; RV64I-NEXT: sb a3, 3(a2)
+; RV64I-NEXT: srli a3, t0, 16
+; RV64I-NEXT: sb a3, 2(a2)
+; RV64I-NEXT: srli a3, t0, 8
+; RV64I-NEXT: sb a3, 1(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 11(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 10(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 9(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a4, a4, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sb a0, 12(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_32bytes_wordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: andi a1, a1, 28
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %wordOff = load i256, ptr %wordOff.ptr, align 1
+ %bitOff = shl i256 %wordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; RV64I-LABEL: ashr_32bytes_dwordOff:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: lbu a3, 1(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: slli a5, a5, 16
+; RV64I-NEXT: slli a6, a6, 24
+; RV64I-NEXT: or a4, a6, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: slli a5, a5, 32
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a1, 0(a1)
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: slli a1, a1, 3
+; RV64I-NEXT: andi a1, a1, 24
+; RV64I-NEXT: mv a0, sp
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ld a1, 16(a0)
+; RV64I-NEXT: ld a3, 24(a0)
+; RV64I-NEXT: ld a4, 0(a0)
+; RV64I-NEXT: ld a0, 8(a0)
+; RV64I-NEXT: sb a1, 16(a2)
+; RV64I-NEXT: sb a3, 24(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a5, a1, 56
+; RV64I-NEXT: sb a5, 23(a2)
+; RV64I-NEXT: srli a5, a1, 48
+; RV64I-NEXT: sb a5, 22(a2)
+; RV64I-NEXT: srli a5, a1, 40
+; RV64I-NEXT: sb a5, 21(a2)
+; RV64I-NEXT: srli a5, a1, 32
+; RV64I-NEXT: sb a5, 20(a2)
+; RV64I-NEXT: srli a5, a1, 24
+; RV64I-NEXT: sb a5, 19(a2)
+; RV64I-NEXT: srli a5, a1, 16
+; RV64I-NEXT: sb a5, 18(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 17(a2)
+; RV64I-NEXT: srli a1, a3, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a3, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a3, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a3, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a3, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a3, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a3, a3, 8
+; RV64I-NEXT: sb a3, 25(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
+; RV64I-NEXT: srli a4, a4, 8
+; RV64I-NEXT: sb a4, 1(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: ashr_32bytes_dwordOff:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
+; RV32I-NEXT: slli a5, a5, 16
+; RV32I-NEXT: slli a6, a6, 24
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or a7, t0, a7
+; RV32I-NEXT: lbu t0, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli t0, t0, 8
+; RV32I-NEXT: or t0, t0, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t0, t1, t0
+; RV32I-NEXT: lbu t1, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli t1, t1, 8
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t1, t2, t1
+; RV32I-NEXT: lbu t2, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli t2, t2, 8
+; RV32I-NEXT: or t2, t2, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t2, t3, t2
+; RV32I-NEXT: lbu a1, 0(a1)
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t2, 28(sp)
+; RV32I-NEXT: sw t1, 24(sp)
+; RV32I-NEXT: sw t0, 20(sp)
+; RV32I-NEXT: sw a7, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: slli a1, a1, 3
+; RV32I-NEXT: andi a1, a1, 24
+; RV32I-NEXT: mv a0, sp
+; RV32I-NEXT: add a3, a0, a1
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a1, 0(a3)
+; RV32I-NEXT: lw a4, 12(a3)
+; RV32I-NEXT: lw a5, 8(a3)
+; RV32I-NEXT: lw a6, 24(a3)
+; RV32I-NEXT: lw a7, 28(a3)
+; RV32I-NEXT: lw t0, 16(a3)
+; RV32I-NEXT: lw a3, 20(a3)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb a7, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb a3, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a4, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli t1, a6, 24
+; RV32I-NEXT: sb t1, 27(a2)
+; RV32I-NEXT: srli t1, a6, 16
+; RV32I-NEXT: sb t1, 26(a2)
+; RV32I-NEXT: srli a6, a6, 8
+; RV32I-NEXT: sb a6, 25(a2)
+; RV32I-NEXT: srli a6, a7, 24
+; RV32I-NEXT: sb a6, 31(a2)
+; RV32I-NEXT: srli a6, a7, 16
+; RV32I-NEXT: sb a6, 30(a2)
+; RV32I-NEXT: srli a6, a7, 8
+; RV32I-NEXT: sb a6, 29(a2)
+; RV32I-NEXT: srli a6, t0, 24
+; RV32I-NEXT: sb a6, 19(a2)
+; RV32I-NEXT: srli a6, t0, 16
+; RV32I-NEXT: sb a6, 18(a2)
+; RV32I-NEXT: srli a6, t0, 8
+; RV32I-NEXT: sb a6, 17(a2)
+; RV32I-NEXT: srli a6, a3, 24
+; RV32I-NEXT: sb a6, 23(a2)
+; RV32I-NEXT: srli a6, a3, 16
+; RV32I-NEXT: sb a6, 22(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 21(a2)
+; RV32I-NEXT: srli a3, a5, 24
+; RV32I-NEXT: sb a3, 11(a2)
+; RV32I-NEXT: srli a3, a5, 16
+; RV32I-NEXT: sb a3, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: sb a3, 14(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: ret
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 6
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2af..7e879b137b4f 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: lshr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb s4, 26(sp)
-; RV32I-NEXT: sb s3, 25(sp)
-; RV32I-NEXT: sb s2, 24(sp)
-; RV32I-NEXT: sb t6, 23(sp)
-; RV32I-NEXT: sb t5, 22(sp)
-; RV32I-NEXT: sb t4, 21(sp)
-; RV32I-NEXT: sb t3, 20(sp)
-; RV32I-NEXT: sb t2, 19(sp)
-; RV32I-NEXT: sb t1, 18(sp)
-; RV32I-NEXT: sb t0, 17(sp)
-; RV32I-NEXT: sb a7, 16(sp)
-; RV32I-NEXT: sb a6, 15(sp)
-; RV32I-NEXT: sb a5, 14(sp)
-; RV32I-NEXT: sb a4, 13(sp)
-; RV32I-NEXT: sb a3, 12(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 12
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
-; RV32I-NEXT: slli a7, a7, 8
-; RV32I-NEXT: or a7, a7, t0
-; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: srl a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw a0, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: srl a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: shl_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: lbu a4, 1(a0)
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
; RV32I-NEXT: lbu a5, 2(a0)
; RV32I-NEXT: lbu a6, 3(a0)
-; RV32I-NEXT: lbu a7, 4(a0)
-; RV32I-NEXT: lbu t0, 5(a0)
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 1(a1)
-; RV32I-NEXT: lbu s1, 0(a1)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: slli s0, s0, 8
-; RV32I-NEXT: or s0, s0, s1
-; RV32I-NEXT: lbu s1, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu a0, 15(a0)
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: or a1, a1, s0
-; RV32I-NEXT: sb zero, 27(sp)
-; RV32I-NEXT: sb zero, 26(sp)
-; RV32I-NEXT: sb zero, 25(sp)
-; RV32I-NEXT: sb zero, 24(sp)
-; RV32I-NEXT: sb zero, 23(sp)
-; RV32I-NEXT: sb zero, 22(sp)
-; RV32I-NEXT: sb zero, 21(sp)
-; RV32I-NEXT: sb zero, 20(sp)
-; RV32I-NEXT: sb zero, 19(sp)
-; RV32I-NEXT: sb zero, 18(sp)
-; RV32I-NEXT: sb zero, 17(sp)
-; RV32I-NEXT: sb zero, 16(sp)
-; RV32I-NEXT: sb zero, 15(sp)
-; RV32I-NEXT: sb zero, 14(sp)
-; RV32I-NEXT: sb zero, 13(sp)
-; RV32I-NEXT: sb zero, 12(sp)
-; RV32I-NEXT: sb a0, 43(sp)
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: sb t0, 33(sp)
-; RV32I-NEXT: sb a7, 32(sp)
-; RV32I-NEXT: sb a6, 31(sp)
-; RV32I-NEXT: sb a5, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a3, 28(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 28
-; RV32I-NEXT: sub a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: sll a0, a5, a4
-; RV32I-NEXT: lbu a1, 1(a3)
-; RV32I-NEXT: lbu a6, 0(a3)
-; RV32I-NEXT: lbu a7, 2(a3)
-; RV32I-NEXT: lbu t0, 3(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: srli a1, a6, 1
-; RV32I-NEXT: xori a7, a4, 31
-; RV32I-NEXT: srl a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu t0, 13(a3)
-; RV32I-NEXT: lbu t1, 12(a3)
-; RV32I-NEXT: lbu t2, 14(a3)
-; RV32I-NEXT: lbu t3, 15(a3)
-; RV32I-NEXT: slli t0, t0, 8
-; RV32I-NEXT: or t0, t0, t1
-; RV32I-NEXT: slli t2, t2, 16
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: or t1, t3, t2
-; RV32I-NEXT: or t0, t1, t0
-; RV32I-NEXT: sll t0, t0, a4
-; RV32I-NEXT: lbu t1, 9(a3)
-; RV32I-NEXT: lbu t2, 8(a3)
-; RV32I-NEXT: lbu t3, 10(a3)
-; RV32I-NEXT: lbu a3, 11(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: srli t1, a3, 1
-; RV32I-NEXT: srl a7, t1, a7
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: sll a3, a3, a4
-; RV32I-NEXT: srli a5, a5, 1
-; RV32I-NEXT: not t1, a4
-; RV32I-NEXT: srl a5, a5, t1
-; RV32I-NEXT: or a5, a3, a5
-; RV32I-NEXT: sll a4, a6, a4
-; RV32I-NEXT: sb a4, 0(a2)
-; RV32I-NEXT: srli a6, a3, 16
-; RV32I-NEXT: sb a6, 10(a2)
-; RV32I-NEXT: srli a6, a3, 24
-; RV32I-NEXT: sb a6, 11(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 9(a2)
-; RV32I-NEXT: srli a3, t0, 16
-; RV32I-NEXT: sb a3, 14(a2)
-; RV32I-NEXT: srli a3, t0, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a3, t0, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a4, 16
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: or a0, a0, a6
+; RV32I-NEXT: lbu a6, 1(a1)
+; RV32I-NEXT: lbu a7, 0(a1)
+; RV32I-NEXT: lbu t0, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t0
+; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a5, 24(sp)
+; RV32I-NEXT: sw a4, 20(sp)
+; RV32I-NEXT: sw a3, 16(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: addi a3, sp, 16
+; RV32I-NEXT: sub a3, a3, a0
+; RV32I-NEXT: lw a0, 4(a3)
+; RV32I-NEXT: lw a4, 0(a3)
+; RV32I-NEXT: sll a5, a0, a1
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: srli a7, a4, 1
+; RV32I-NEXT: lw t0, 12(a3)
+; RV32I-NEXT: lw a3, 8(a3)
+; RV32I-NEXT: srl a7, a7, a6
+; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: sll a7, t0, a1
+; RV32I-NEXT: srli t0, a3, 1
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: sll a3, a3, a1
+; RV32I-NEXT: srli a0, a0, 1
+; RV32I-NEXT: srl a0, a0, a6
+; RV32I-NEXT: or a0, a3, a0
+; RV32I-NEXT: sll a1, a4, a1
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a3, a1, 16
; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a4, 24
+; RV32I-NEXT: srli a3, a1, 24
; RV32I-NEXT: sb a3, 3(a2)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
-; RV32I-NEXT: srli a3, a0, 24
-; RV32I-NEXT: sb a3, 7(a2)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: sb a0, 8(a2)
; RV32I-NEXT: sb a7, 12(a2)
-; RV32I-NEXT: sb a1, 4(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: sb a5, 4(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 10(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 11(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 9(a2)
+; RV32I-NEXT: srli a0, a7, 16
+; RV32I-NEXT: sb a0, 14(a2)
+; RV32I-NEXT: srli a0, a7, 24
+; RV32I-NEXT: sb a0, 15(a2)
+; RV32I-NEXT: srli a0, a7, 8
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a5, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; RV32I-LABEL: ashr_16bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -64
-; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 15(a0)
-; RV32I-NEXT: slli a4, a3, 24
-; RV32I-NEXT: lbu a5, 0(a0)
-; RV32I-NEXT: lbu a6, 1(a0)
-; RV32I-NEXT: lbu a7, 2(a0)
-; RV32I-NEXT: lbu t0, 3(a0)
-; RV32I-NEXT: lbu t1, 4(a0)
-; RV32I-NEXT: lbu t2, 5(a0)
-; RV32I-NEXT: lbu t3, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 1(a1)
-; RV32I-NEXT: lbu s2, 0(a1)
-; RV32I-NEXT: lbu s3, 11(a0)
-; RV32I-NEXT: lbu s4, 12(a0)
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or s1, s1, s2
-; RV32I-NEXT: lbu s2, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: lbu s5, 13(a0)
-; RV32I-NEXT: lbu a0, 14(a0)
-; RV32I-NEXT: slli s2, s2, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, s2
-; RV32I-NEXT: or a1, a1, s1
-; RV32I-NEXT: sb a3, 23(sp)
-; RV32I-NEXT: sb a0, 22(sp)
-; RV32I-NEXT: sb s5, 21(sp)
-; RV32I-NEXT: sb s4, 20(sp)
-; RV32I-NEXT: sb s3, 19(sp)
-; RV32I-NEXT: sb s0, 18(sp)
-; RV32I-NEXT: sb t6, 17(sp)
-; RV32I-NEXT: sb t5, 16(sp)
-; RV32I-NEXT: sb t4, 15(sp)
-; RV32I-NEXT: sb t3, 14(sp)
-; RV32I-NEXT: sb t2, 13(sp)
-; RV32I-NEXT: sb t1, 12(sp)
-; RV32I-NEXT: sb t0, 11(sp)
-; RV32I-NEXT: sb a7, 10(sp)
-; RV32I-NEXT: sb a6, 9(sp)
-; RV32I-NEXT: sb a5, 8(sp)
-; RV32I-NEXT: srai a4, a4, 31
-; RV32I-NEXT: sb a4, 36(sp)
-; RV32I-NEXT: sb a4, 32(sp)
-; RV32I-NEXT: sb a4, 28(sp)
-; RV32I-NEXT: sb a4, 24(sp)
-; RV32I-NEXT: srli a0, a4, 24
-; RV32I-NEXT: sb a0, 39(sp)
-; RV32I-NEXT: srli a3, a4, 16
-; RV32I-NEXT: sb a3, 38(sp)
-; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 37(sp)
-; RV32I-NEXT: sb a0, 35(sp)
-; RV32I-NEXT: sb a3, 34(sp)
-; RV32I-NEXT: sb a4, 33(sp)
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: sb a3, 30(sp)
-; RV32I-NEXT: sb a4, 29(sp)
-; RV32I-NEXT: sb a0, 27(sp)
-; RV32I-NEXT: sb a3, 26(sp)
-; RV32I-NEXT: sb a4, 25(sp)
-; RV32I-NEXT: slli a0, a1, 25
-; RV32I-NEXT: srli a0, a0, 28
-; RV32I-NEXT: addi a3, sp, 8
-; RV32I-NEXT: add a3, a3, a0
-; RV32I-NEXT: lbu a0, 5(a3)
-; RV32I-NEXT: lbu a4, 4(a3)
-; RV32I-NEXT: lbu a5, 6(a3)
-; RV32I-NEXT: lbu a6, 7(a3)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a4
+; RV32I-NEXT: addi sp, sp, -32
+; RV32I-NEXT: lbu a3, 1(a0)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
; RV32I-NEXT: or a4, a6, a5
-; RV32I-NEXT: or a5, a4, a0
-; RV32I-NEXT: andi a4, a1, 7
-; RV32I-NEXT: srl a0, a5, a4
-; RV32I-NEXT: lbu a1, 9(a3)
-; RV32I-NEXT: lbu a6, 8(a3)
-; RV32I-NEXT: lbu a7, 10(a3)
-; RV32I-NEXT: lbu t0, 11(a3)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a6
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: or a6, a6, a1
-; RV32I-NEXT: slli a1, a6, 1
-; RV32I-NEXT: not a7, a4
-; RV32I-NEXT: sll a1, a1, a7
-; RV32I-NEXT: or a1, a0, a1
-; RV32I-NEXT: lbu a7, 1(a3)
-; RV32I-NEXT: lbu t0, 0(a3)
-; RV32I-NEXT: lbu t1, 2(a3)
-; RV32I-NEXT: lbu t2, 3(a3)
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu a0, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a7, a0, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t0, 0(a1)
+; RV32I-NEXT: lbu t1, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
; RV32I-NEXT: slli a7, a7, 8
; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
-; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or t0, t2, t1
-; RV32I-NEXT: or a7, t0, a7
-; RV32I-NEXT: srl a7, a7, a4
-; RV32I-NEXT: slli a5, a5, 1
-; RV32I-NEXT: xori t0, a4, 31
-; RV32I-NEXT: sll a5, a5, t0
-; RV32I-NEXT: or a5, a7, a5
-; RV32I-NEXT: srl a6, a6, a4
-; RV32I-NEXT: lbu t1, 13(a3)
-; RV32I-NEXT: lbu t2, 12(a3)
-; RV32I-NEXT: lbu t3, 14(a3)
-; RV32I-NEXT: lbu a3, 15(a3)
-; RV32I-NEXT: slli t1, t1, 8
-; RV32I-NEXT: or t1, t1, t2
-; RV32I-NEXT: slli t3, t3, 16
-; RV32I-NEXT: slli a3, a3, 24
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: or a3, a3, t1
-; RV32I-NEXT: slli t1, a3, 1
-; RV32I-NEXT: sll t0, t1, t0
-; RV32I-NEXT: or t0, a6, t0
-; RV32I-NEXT: sra a3, a3, a4
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: sb a3, 12(a2)
-; RV32I-NEXT: sb a7, 0(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: srli a4, a6, 16
-; RV32I-NEXT: sb a4, 10(a2)
-; RV32I-NEXT: srli a4, a6, 8
-; RV32I-NEXT: sb a4, 9(a2)
-; RV32I-NEXT: srli a4, a3, 16
-; RV32I-NEXT: sb a4, 14(a2)
-; RV32I-NEXT: srli a4, a3, 24
-; RV32I-NEXT: sb a4, 15(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 13(a2)
-; RV32I-NEXT: srli a3, a7, 16
-; RV32I-NEXT: sb a3, 2(a2)
-; RV32I-NEXT: srli a3, a7, 8
-; RV32I-NEXT: sb a3, 1(a2)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 6(a2)
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t1
+; RV32I-NEXT: or a1, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw a0, 24(sp)
+; RV32I-NEXT: sw a0, 20(sp)
+; RV32I-NEXT: sw a0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a1, 3
+; RV32I-NEXT: andi a0, a0, 12
+; RV32I-NEXT: mv a3, sp
+; RV32I-NEXT: add a0, a3, a0
+; RV32I-NEXT: lw a3, 4(a0)
+; RV32I-NEXT: srl a4, a3, a1
+; RV32I-NEXT: lw a5, 8(a0)
+; RV32I-NEXT: andi a6, a1, 31
+; RV32I-NEXT: xori a6, a6, 31
+; RV32I-NEXT: lw a7, 0(a0)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a4, a4, t0
+; RV32I-NEXT: srl a7, a7, a1
+; RV32I-NEXT: slli a3, a3, 1
+; RV32I-NEXT: lw a0, 12(a0)
+; RV32I-NEXT: sll a3, a3, a6
+; RV32I-NEXT: or a3, a7, a3
+; RV32I-NEXT: srl a5, a5, a1
+; RV32I-NEXT: slli a7, a0, 1
+; RV32I-NEXT: sll a6, a7, a6
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: sra a0, a0, a1
+; RV32I-NEXT: sb a0, 12(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 14(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 15(a2)
; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 5(a2)
-; RV32I-NEXT: srli a0, t0, 24
+; RV32I-NEXT: sb a0, 13(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 0(a2)
+; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: srli a0, a5, 16
+; RV32I-NEXT: sb a0, 10(a2)
+; RV32I-NEXT: srli a0, a5, 24
; RV32I-NEXT: sb a0, 11(a2)
-; RV32I-NEXT: srli a5, a5, 24
-; RV32I-NEXT: sb a5, 3(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a0, a3, 16
+; RV32I-NEXT: sb a0, 2(a2)
+; RV32I-NEXT: srli a0, a3, 24
+; RV32I-NEXT: sb a0, 3(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 1(a2)
+; RV32I-NEXT: srli a0, a4, 16
+; RV32I-NEXT: sb a0, 6(a2)
+; RV32I-NEXT: srli a0, a4, 24
+; RV32I-NEXT: sb a0, 7(a2)
+; RV32I-NEXT: srli a4, a4, 8
+; RV32I-NEXT: sb a4, 5(a2)
+; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: lshr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 87(sp)
-; RV64I-NEXT: sb a3, 86(sp)
-; RV64I-NEXT: sb a4, 85(sp)
-; RV64I-NEXT: sb a0, 84(sp)
-; RV64I-NEXT: sb a5, 83(sp)
-; RV64I-NEXT: sb a6, 82(sp)
-; RV64I-NEXT: sb a7, 81(sp)
-; RV64I-NEXT: sb s11, 80(sp)
-; RV64I-NEXT: sb s10, 79(sp)
-; RV64I-NEXT: sb ra, 78(sp)
-; RV64I-NEXT: sb s9, 77(sp)
-; RV64I-NEXT: sb s8, 76(sp)
-; RV64I-NEXT: sb s7, 75(sp)
-; RV64I-NEXT: sb s6, 74(sp)
-; RV64I-NEXT: sb s5, 73(sp)
-; RV64I-NEXT: sb s4, 72(sp)
-; RV64I-NEXT: sb s3, 71(sp)
-; RV64I-NEXT: sb s2, 70(sp)
-; RV64I-NEXT: sb s1, 69(sp)
-; RV64I-NEXT: sb s0, 68(sp)
-; RV64I-NEXT: sb t6, 67(sp)
-; RV64I-NEXT: sb t5, 66(sp)
-; RV64I-NEXT: sb t4, 65(sp)
-; RV64I-NEXT: sb zero, 119(sp)
-; RV64I-NEXT: sb zero, 118(sp)
-; RV64I-NEXT: sb zero, 117(sp)
-; RV64I-NEXT: sb zero, 116(sp)
-; RV64I-NEXT: sb zero, 115(sp)
-; RV64I-NEXT: sb zero, 114(sp)
-; RV64I-NEXT: sb zero, 113(sp)
-; RV64I-NEXT: sb zero, 112(sp)
-; RV64I-NEXT: sb zero, 111(sp)
-; RV64I-NEXT: sb zero, 110(sp)
-; RV64I-NEXT: sb zero, 109(sp)
-; RV64I-NEXT: sb zero, 108(sp)
-; RV64I-NEXT: sb zero, 107(sp)
-; RV64I-NEXT: sb zero, 106(sp)
-; RV64I-NEXT: sb zero, 105(sp)
-; RV64I-NEXT: sb zero, 104(sp)
-; RV64I-NEXT: sb zero, 103(sp)
-; RV64I-NEXT: sb zero, 102(sp)
-; RV64I-NEXT: sb zero, 101(sp)
-; RV64I-NEXT: sb zero, 100(sp)
-; RV64I-NEXT: sb zero, 99(sp)
-; RV64I-NEXT: sb zero, 98(sp)
-; RV64I-NEXT: sb zero, 97(sp)
-; RV64I-NEXT: sb zero, 96(sp)
-; RV64I-NEXT: sb zero, 95(sp)
-; RV64I-NEXT: sb zero, 94(sp)
-; RV64I-NEXT: sb zero, 93(sp)
-; RV64I-NEXT: sb zero, 92(sp)
-; RV64I-NEXT: sb zero, 91(sp)
-; RV64I-NEXT: sb zero, 90(sp)
-; RV64I-NEXT: sb zero, 89(sp)
-; RV64I-NEXT: sb zero, 88(sp)
-; RV64I-NEXT: sb t3, 64(sp)
-; RV64I-NEXT: sb t2, 63(sp)
-; RV64I-NEXT: sb t1, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a3, sp, 56
-; RV64I-NEXT: add a3, a3, a0
-; RV64I-NEXT: lbu a0, 9(a3)
-; RV64I-NEXT: lbu a1, 8(a3)
-; RV64I-NEXT: lbu a4, 10(a3)
-; RV64I-NEXT: lbu a5, 11(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a1
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a1, 13(a3)
-; RV64I-NEXT: lbu a4, 12(a3)
-; RV64I-NEXT: lbu a5, 14(a3)
-; RV64I-NEXT: lbu a6, 15(a3)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a4
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
+; RV64I-NEXT: slli a3, a3, 8
+; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a4, a1, a0
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a0, 17(a3)
-; RV64I-NEXT: lbu a5, 16(a3)
-; RV64I-NEXT: lbu a6, 18(a3)
-; RV64I-NEXT: lbu a7, 19(a3)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a3)
-; RV64I-NEXT: lbu a6, 20(a3)
-; RV64I-NEXT: lbu a7, 22(a3)
-; RV64I-NEXT: lbu t0, 23(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a1
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a3)
-; RV64I-NEXT: lbu a7, 0(a3)
-; RV64I-NEXT: lbu t0, 2(a3)
-; RV64I-NEXT: lbu t1, 3(a3)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a3)
-; RV64I-NEXT: lbu t0, 4(a3)
-; RV64I-NEXT: lbu t1, 6(a3)
-; RV64I-NEXT: lbu t2, 7(a3)
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a3)
-; RV64I-NEXT: lbu t0, 24(a3)
-; RV64I-NEXT: lbu t1, 26(a3)
-; RV64I-NEXT: lbu t2, 27(a3)
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a3)
-; RV64I-NEXT: lbu t1, 28(a3)
-; RV64I-NEXT: lbu t2, 30(a3)
-; RV64I-NEXT: lbu a3, 31(a3)
-; RV64I-NEXT: slli t0, t0, 8
-; RV64I-NEXT: or t0, t0, t1
-; RV64I-NEXT: slli t2, t2, 16
-; RV64I-NEXT: slli a3, a3, 24
-; RV64I-NEXT: or a3, a3, t2
-; RV64I-NEXT: slli t1, a4, 1
-; RV64I-NEXT: or a3, a3, t0
-; RV64I-NEXT: xori t0, a1, 63
-; RV64I-NEXT: sll t1, t1, t0
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a7, a3, a7
-; RV64I-NEXT: slli a3, a7, 1
-; RV64I-NEXT: sll t0, a3, t0
-; RV64I-NEXT: srl a3, a4, a1
-; RV64I-NEXT: srl a4, a6, a1
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 56(sp)
+; RV64I-NEXT: sd zero, 48(sp)
+; RV64I-NEXT: sd zero, 40(sp)
+; RV64I-NEXT: sd zero, 32(sp)
+; RV64I-NEXT: sd a0, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
; RV64I-NEXT: srl a5, a5, a1
-; RV64I-NEXT: srl a1, a7, a1
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
-; RV64I-NEXT: sb a5, 16(a2)
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 26(a2)
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: srl a1, a3, a1
; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
+; RV64I-NEXT: srli a5, a5, 8
+; RV64I-NEXT: sb a5, 17(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
; RV64I-NEXT: srli a1, a4, 48
; RV64I-NEXT: sb a1, 6(a2)
; RV64I-NEXT: srli a1, a4, 40
@@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sb a1, 3(a2)
; RV64I-NEXT: srli a1, a4, 16
; RV64I-NEXT: sb a1, 2(a2)
-; RV64I-NEXT: or a1, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a3, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a3, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a3, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a3, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a3, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
-; RV64I-NEXT: sb a3, 8(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: srli a3, a6, 56
-; RV64I-NEXT: sb a3, 23(a2)
-; RV64I-NEXT: srli a1, a1, 56
-; RV64I-NEXT: sb a1, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: lshr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 59(sp)
-; RV32I-NEXT: sb a3, 58(sp)
-; RV32I-NEXT: sb a4, 57(sp)
-; RV32I-NEXT: sb a0, 56(sp)
-; RV32I-NEXT: sb a5, 55(sp)
-; RV32I-NEXT: sb a6, 54(sp)
-; RV32I-NEXT: sb a7, 53(sp)
-; RV32I-NEXT: sb s10, 52(sp)
-; RV32I-NEXT: sb ra, 51(sp)
-; RV32I-NEXT: sb s11, 50(sp)
-; RV32I-NEXT: sb s9, 49(sp)
-; RV32I-NEXT: sb s8, 48(sp)
-; RV32I-NEXT: sb s7, 47(sp)
-; RV32I-NEXT: sb s6, 46(sp)
-; RV32I-NEXT: sb s5, 45(sp)
-; RV32I-NEXT: sb s4, 44(sp)
-; RV32I-NEXT: sb zero, 91(sp)
-; RV32I-NEXT: sb zero, 90(sp)
-; RV32I-NEXT: sb zero, 89(sp)
-; RV32I-NEXT: sb zero, 88(sp)
-; RV32I-NEXT: sb zero, 87(sp)
-; RV32I-NEXT: sb zero, 86(sp)
-; RV32I-NEXT: sb zero, 85(sp)
-; RV32I-NEXT: sb zero, 84(sp)
-; RV32I-NEXT: sb zero, 83(sp)
-; RV32I-NEXT: sb zero, 82(sp)
-; RV32I-NEXT: sb zero, 81(sp)
-; RV32I-NEXT: sb zero, 80(sp)
-; RV32I-NEXT: sb zero, 79(sp)
-; RV32I-NEXT: sb zero, 78(sp)
-; RV32I-NEXT: sb zero, 77(sp)
-; RV32I-NEXT: sb zero, 76(sp)
-; RV32I-NEXT: sb zero, 75(sp)
-; RV32I-NEXT: sb zero, 74(sp)
-; RV32I-NEXT: sb zero, 73(sp)
-; RV32I-NEXT: sb zero, 72(sp)
-; RV32I-NEXT: sb zero, 71(sp)
-; RV32I-NEXT: sb zero, 70(sp)
-; RV32I-NEXT: sb zero, 69(sp)
-; RV32I-NEXT: sb zero, 68(sp)
-; RV32I-NEXT: sb zero, 67(sp)
-; RV32I-NEXT: sb zero, 66(sp)
-; RV32I-NEXT: sb zero, 65(sp)
-; RV32I-NEXT: sb zero, 64(sp)
-; RV32I-NEXT: sb zero, 63(sp)
-; RV32I-NEXT: sb zero, 62(sp)
-; RV32I-NEXT: sb zero, 61(sp)
-; RV32I-NEXT: sb zero, 60(sp)
-; RV32I-NEXT: sb s3, 43(sp)
-; RV32I-NEXT: sb s2, 42(sp)
-; RV32I-NEXT: sb s1, 41(sp)
-; RV32I-NEXT: sb s0, 40(sp)
-; RV32I-NEXT: sb t6, 39(sp)
-; RV32I-NEXT: sb t5, 38(sp)
-; RV32I-NEXT: sb t4, 37(sp)
-; RV32I-NEXT: sb t3, 36(sp)
-; RV32I-NEXT: sb t2, 35(sp)
-; RV32I-NEXT: sb t1, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t0, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: srl a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 60(sp)
+; RV32I-NEXT: sw zero, 56(sp)
+; RV32I-NEXT: sw zero, 52(sp)
+; RV32I-NEXT: sw zero, 48(sp)
+; RV32I-NEXT: sw zero, 44(sp)
+; RV32I-NEXT: sw zero, 40(sp)
+; RV32I-NEXT: sw zero, 36(sp)
+; RV32I-NEXT: sw zero, 32(sp)
+; RV32I-NEXT: sw a0, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: srl a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: shl_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 6(a0)
-; RV64I-NEXT: lbu t2, 7(a0)
-; RV64I-NEXT: lbu t3, 8(a0)
-; RV64I-NEXT: lbu t4, 9(a0)
-; RV64I-NEXT: lbu t5, 10(a0)
-; RV64I-NEXT: lbu t6, 11(a0)
-; RV64I-NEXT: lbu s0, 12(a0)
-; RV64I-NEXT: lbu s1, 13(a0)
-; RV64I-NEXT: lbu s2, 14(a0)
-; RV64I-NEXT: lbu s3, 15(a0)
-; RV64I-NEXT: lbu s4, 16(a0)
-; RV64I-NEXT: lbu s5, 17(a0)
-; RV64I-NEXT: lbu s6, 18(a0)
-; RV64I-NEXT: lbu s7, 19(a0)
-; RV64I-NEXT: lbu s8, 20(a0)
-; RV64I-NEXT: lbu s9, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli s9, s9, 8
-; RV64I-NEXT: or s9, s9, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or s11, s11, s9
-; RV64I-NEXT: lbu s9, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s9
-; RV64I-NEXT: lbu s9, 21(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 22(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 23(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t0, a1, s11
-; RV64I-NEXT: lbu s11, 24(a0)
-; RV64I-NEXT: lbu a7, 25(a0)
-; RV64I-NEXT: lbu a6, 26(a0)
-; RV64I-NEXT: lbu a5, 27(a0)
-; RV64I-NEXT: lbu a1, 31(a0)
-; RV64I-NEXT: lbu a3, 30(a0)
-; RV64I-NEXT: lbu a4, 29(a0)
-; RV64I-NEXT: lbu a0, 28(a0)
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: sb a0, 116(sp)
-; RV64I-NEXT: sb a5, 115(sp)
-; RV64I-NEXT: sb a6, 114(sp)
-; RV64I-NEXT: sb a7, 113(sp)
-; RV64I-NEXT: sb s11, 112(sp)
-; RV64I-NEXT: sb s10, 111(sp)
-; RV64I-NEXT: sb ra, 110(sp)
-; RV64I-NEXT: sb s9, 109(sp)
-; RV64I-NEXT: sb s8, 108(sp)
-; RV64I-NEXT: sb s7, 107(sp)
-; RV64I-NEXT: sb s6, 106(sp)
-; RV64I-NEXT: sb s5, 105(sp)
-; RV64I-NEXT: sb s4, 104(sp)
-; RV64I-NEXT: sb s3, 103(sp)
-; RV64I-NEXT: sb s2, 102(sp)
-; RV64I-NEXT: sb s1, 101(sp)
-; RV64I-NEXT: sb s0, 100(sp)
-; RV64I-NEXT: sb t6, 99(sp)
-; RV64I-NEXT: sb t5, 98(sp)
-; RV64I-NEXT: sb t4, 97(sp)
-; RV64I-NEXT: sb t3, 96(sp)
-; RV64I-NEXT: sb zero, 87(sp)
-; RV64I-NEXT: sb zero, 86(sp)
-; RV64I-NEXT: sb zero, 85(sp)
-; RV64I-NEXT: sb zero, 84(sp)
-; RV64I-NEXT: sb zero, 83(sp)
-; RV64I-NEXT: sb zero, 82(sp)
-; RV64I-NEXT: sb zero, 81(sp)
-; RV64I-NEXT: sb zero, 80(sp)
-; RV64I-NEXT: sb zero, 79(sp)
-; RV64I-NEXT: sb zero, 78(sp)
-; RV64I-NEXT: sb zero, 77(sp)
-; RV64I-NEXT: sb zero, 76(sp)
-; RV64I-NEXT: sb zero, 75(sp)
-; RV64I-NEXT: sb zero, 74(sp)
-; RV64I-NEXT: sb zero, 73(sp)
-; RV64I-NEXT: sb zero, 72(sp)
-; RV64I-NEXT: sb zero, 71(sp)
-; RV64I-NEXT: sb zero, 70(sp)
-; RV64I-NEXT: sb zero, 69(sp)
-; RV64I-NEXT: sb zero, 68(sp)
-; RV64I-NEXT: sb zero, 67(sp)
-; RV64I-NEXT: sb zero, 66(sp)
-; RV64I-NEXT: sb zero, 65(sp)
-; RV64I-NEXT: sb zero, 64(sp)
-; RV64I-NEXT: sb zero, 63(sp)
-; RV64I-NEXT: sb zero, 62(sp)
-; RV64I-NEXT: sb zero, 61(sp)
-; RV64I-NEXT: sb zero, 60(sp)
-; RV64I-NEXT: sb zero, 59(sp)
-; RV64I-NEXT: sb zero, 58(sp)
-; RV64I-NEXT: sb zero, 57(sp)
-; RV64I-NEXT: sb zero, 56(sp)
-; RV64I-NEXT: sb t2, 95(sp)
-; RV64I-NEXT: sb t1, 94(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 93(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 92(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 91(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 90(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: slli a0, t0, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 88
-; RV64I-NEXT: sub a0, a1, a0
-; RV64I-NEXT: lbu a1, 9(a0)
-; RV64I-NEXT: lbu a3, 8(a0)
-; RV64I-NEXT: lbu a4, 10(a0)
-; RV64I-NEXT: lbu a5, 11(a0)
-; RV64I-NEXT: slli a1, a1, 8
-; RV64I-NEXT: or a1, a1, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a1, a4, a1
-; RV64I-NEXT: lbu a3, 13(a0)
-; RV64I-NEXT: lbu a4, 12(a0)
-; RV64I-NEXT: lbu a5, 14(a0)
-; RV64I-NEXT: lbu a6, 15(a0)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a3, a3, a1
-; RV64I-NEXT: andi a1, t0, 7
-; RV64I-NEXT: lbu a4, 1(a0)
-; RV64I-NEXT: lbu a5, 0(a0)
-; RV64I-NEXT: lbu a6, 2(a0)
-; RV64I-NEXT: lbu a7, 3(a0)
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
; RV64I-NEXT: slli a4, a4, 8
; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 5(a0)
-; RV64I-NEXT: lbu a6, 4(a0)
-; RV64I-NEXT: lbu a7, 6(a0)
-; RV64I-NEXT: lbu t0, 7(a0)
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: lbu a5, 25(a0)
-; RV64I-NEXT: lbu a6, 24(a0)
-; RV64I-NEXT: lbu a7, 26(a0)
-; RV64I-NEXT: lbu t0, 27(a0)
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
; RV64I-NEXT: slli t0, t0, 24
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 29(a0)
-; RV64I-NEXT: lbu a7, 28(a0)
-; RV64I-NEXT: lbu t0, 30(a0)
-; RV64I-NEXT: lbu t1, 31(a0)
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
@@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, a7, a6
; RV64I-NEXT: slli a6, a6, 32
; RV64I-NEXT: or a5, a6, a5
-; RV64I-NEXT: lbu a6, 17(a0)
-; RV64I-NEXT: lbu a7, 16(a0)
-; RV64I-NEXT: lbu t0, 18(a0)
-; RV64I-NEXT: lbu t1, 19(a0)
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
-; RV64I-NEXT: lbu a7, 21(a0)
-; RV64I-NEXT: or t0, t1, t0
-; RV64I-NEXT: or a6, t0, a6
-; RV64I-NEXT: lbu t0, 20(a0)
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
-; RV64I-NEXT: lbu t1, 22(a0)
-; RV64I-NEXT: lbu a0, 23(a0)
; RV64I-NEXT: or a7, a7, t0
-; RV64I-NEXT: srli t0, a4, 1
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli a0, a0, 24
-; RV64I-NEXT: or t1, a0, t1
-; RV64I-NEXT: xori t2, a1, 63
-; RV64I-NEXT: srl a0, t0, t2
-; RV64I-NEXT: or a7, t1, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: or a0, a0, a6
+; RV64I-NEXT: lbu a6, 1(a1)
+; RV64I-NEXT: lbu a7, 0(a1)
+; RV64I-NEXT: lbu t0, 2(a1)
+; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: srli a7, a6, 1
-; RV64I-NEXT: srl a7, a7, t2
+; RV64I-NEXT: lbu a7, 5(a1)
+; RV64I-NEXT: lbu t0, 4(a1)
+; RV64I-NEXT: lbu t1, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
+; RV64I-NEXT: slli a7, a7, 8
+; RV64I-NEXT: or a7, a7, t0
+; RV64I-NEXT: slli t1, t1, 16
+; RV64I-NEXT: slli a1, a1, 24
+; RV64I-NEXT: or a1, a1, t1
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: or a1, a1, a6
+; RV64I-NEXT: sd zero, 24(sp)
+; RV64I-NEXT: sd zero, 16(sp)
+; RV64I-NEXT: sd zero, 8(sp)
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a5, 48(sp)
+; RV64I-NEXT: sd a4, 40(sp)
+; RV64I-NEXT: sd a3, 32(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: addi a3, sp, 32
+; RV64I-NEXT: sub a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: ld a5, 0(a3)
+; RV64I-NEXT: sll a0, a4, a1
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: srli a7, a5, 1
+; RV64I-NEXT: ld t0, 24(a3)
+; RV64I-NEXT: ld a3, 16(a3)
+; RV64I-NEXT: srl a7, a7, a6
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: sll a7, t0, a1
; RV64I-NEXT: srli t0, a3, 1
-; RV64I-NEXT: not t1, a1
-; RV64I-NEXT: srl t0, t0, t1
+; RV64I-NEXT: srl t0, t0, a6
+; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: sll a3, a3, a1
-; RV64I-NEXT: sll a5, a5, a1
-; RV64I-NEXT: sll a6, a6, a1
-; RV64I-NEXT: sll a1, a4, a1
-; RV64I-NEXT: srli a4, a6, 56
-; RV64I-NEXT: sb a4, 23(a2)
-; RV64I-NEXT: srli a4, a6, 48
-; RV64I-NEXT: sb a4, 22(a2)
-; RV64I-NEXT: srli a4, a6, 40
-; RV64I-NEXT: sb a4, 21(a2)
-; RV64I-NEXT: srli a4, a6, 32
-; RV64I-NEXT: sb a4, 20(a2)
-; RV64I-NEXT: srli a4, a6, 24
-; RV64I-NEXT: sb a4, 19(a2)
-; RV64I-NEXT: srli a4, a6, 16
-; RV64I-NEXT: sb a4, 18(a2)
-; RV64I-NEXT: or a4, a6, t0
-; RV64I-NEXT: srli a6, a6, 8
-; RV64I-NEXT: sb a6, 17(a2)
-; RV64I-NEXT: srli a6, a5, 56
-; RV64I-NEXT: sb a6, 31(a2)
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 30(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 29(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 28(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 27(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 26(a2)
-; RV64I-NEXT: or a6, a5, a7
-; RV64I-NEXT: srli a5, a5, 8
-; RV64I-NEXT: sb a5, 25(a2)
-; RV64I-NEXT: srli a5, a1, 56
-; RV64I-NEXT: sb a5, 7(a2)
-; RV64I-NEXT: srli a5, a1, 48
-; RV64I-NEXT: sb a5, 6(a2)
-; RV64I-NEXT: srli a5, a1, 40
-; RV64I-NEXT: sb a5, 5(a2)
-; RV64I-NEXT: srli a5, a1, 32
-; RV64I-NEXT: sb a5, 4(a2)
-; RV64I-NEXT: srli a5, a1, 24
-; RV64I-NEXT: sb a5, 3(a2)
-; RV64I-NEXT: srli a5, a1, 16
-; RV64I-NEXT: sb a5, 2(a2)
+; RV64I-NEXT: srli a4, a4, 1
+; RV64I-NEXT: srl a4, a4, a6
+; RV64I-NEXT: or a3, a3, a4
+; RV64I-NEXT: sll a1, a5, a1
; RV64I-NEXT: sb a1, 0(a2)
+; RV64I-NEXT: srli a4, a1, 56
+; RV64I-NEXT: sb a4, 7(a2)
+; RV64I-NEXT: srli a4, a1, 48
+; RV64I-NEXT: sb a4, 6(a2)
+; RV64I-NEXT: srli a4, a1, 40
+; RV64I-NEXT: sb a4, 5(a2)
+; RV64I-NEXT: srli a4, a1, 32
+; RV64I-NEXT: sb a4, 4(a2)
+; RV64I-NEXT: srli a4, a1, 24
+; RV64I-NEXT: sb a4, 3(a2)
+; RV64I-NEXT: srli a4, a1, 16
+; RV64I-NEXT: sb a4, 2(a2)
; RV64I-NEXT: srli a1, a1, 8
; RV64I-NEXT: sb a1, 1(a2)
+; RV64I-NEXT: sb a3, 16(a2)
+; RV64I-NEXT: sb a7, 24(a2)
+; RV64I-NEXT: sb a0, 8(a2)
; RV64I-NEXT: srli a1, a3, 56
-; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: sb a1, 23(a2)
; RV64I-NEXT: srli a1, a3, 48
-; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: sb a1, 22(a2)
; RV64I-NEXT: srli a1, a3, 40
-; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: sb a1, 21(a2)
; RV64I-NEXT: srli a1, a3, 32
-; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: sb a1, 20(a2)
; RV64I-NEXT: srli a1, a3, 24
-; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: sb a1, 19(a2)
; RV64I-NEXT: srli a1, a3, 16
-; RV64I-NEXT: sb a1, 10(a2)
-; RV64I-NEXT: or a0, a3, a0
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 9(a2)
-; RV64I-NEXT: sb a4, 16(a2)
-; RV64I-NEXT: sb a6, 24(a2)
-; RV64I-NEXT: sb a0, 8(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: sb a3, 17(a2)
+; RV64I-NEXT: srli a1, a7, 56
+; RV64I-NEXT: sb a1, 31(a2)
+; RV64I-NEXT: srli a1, a7, 48
+; RV64I-NEXT: sb a1, 30(a2)
+; RV64I-NEXT: srli a1, a7, 40
+; RV64I-NEXT: sb a1, 29(a2)
+; RV64I-NEXT: srli a1, a7, 32
+; RV64I-NEXT: sb a1, 28(a2)
+; RV64I-NEXT: srli a1, a7, 24
+; RV64I-NEXT: sb a1, 27(a2)
+; RV64I-NEXT: srli a1, a7, 16
+; RV64I-NEXT: sb a1, 26(a2)
+; RV64I-NEXT: srli a1, a7, 8
+; RV64I-NEXT: sb a1, 25(a2)
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: shl_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t1, 6(a0)
-; RV32I-NEXT: lbu t2, 7(a0)
-; RV32I-NEXT: lbu t3, 8(a0)
-; RV32I-NEXT: lbu t4, 9(a0)
-; RV32I-NEXT: lbu t5, 10(a0)
-; RV32I-NEXT: lbu t6, 11(a0)
-; RV32I-NEXT: lbu s0, 12(a0)
-; RV32I-NEXT: lbu s1, 13(a0)
-; RV32I-NEXT: lbu s2, 14(a0)
-; RV32I-NEXT: lbu s3, 15(a0)
-; RV32I-NEXT: lbu s4, 16(a0)
-; RV32I-NEXT: lbu s5, 17(a0)
-; RV32I-NEXT: lbu s6, 18(a0)
-; RV32I-NEXT: lbu s7, 19(a0)
-; RV32I-NEXT: lbu s10, 1(a1)
-; RV32I-NEXT: lbu s8, 20(a0)
-; RV32I-NEXT: lbu s9, 21(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
-; RV32I-NEXT: slli s10, s10, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or s10, s10, s11
-; RV32I-NEXT: lbu s11, 22(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 23(a0)
-; RV32I-NEXT: or t0, a1, s10
-; RV32I-NEXT: lbu s10, 24(a0)
-; RV32I-NEXT: lbu a7, 25(a0)
-; RV32I-NEXT: lbu a6, 26(a0)
-; RV32I-NEXT: lbu a5, 27(a0)
-; RV32I-NEXT: lbu a1, 31(a0)
-; RV32I-NEXT: lbu a3, 30(a0)
-; RV32I-NEXT: lbu a4, 29(a0)
-; RV32I-NEXT: lbu a0, 28(a0)
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: sb a4, 89(sp)
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a5, 87(sp)
-; RV32I-NEXT: sb a6, 86(sp)
-; RV32I-NEXT: sb a7, 85(sp)
-; RV32I-NEXT: sb s10, 84(sp)
-; RV32I-NEXT: sb ra, 83(sp)
-; RV32I-NEXT: sb s11, 82(sp)
-; RV32I-NEXT: sb s9, 81(sp)
-; RV32I-NEXT: sb s8, 80(sp)
-; RV32I-NEXT: sb s7, 79(sp)
-; RV32I-NEXT: sb s6, 78(sp)
-; RV32I-NEXT: sb s5, 77(sp)
-; RV32I-NEXT: sb s4, 76(sp)
-; RV32I-NEXT: sb zero, 59(sp)
-; RV32I-NEXT: sb zero, 58(sp)
-; RV32I-NEXT: sb zero, 57(sp)
-; RV32I-NEXT: sb zero, 56(sp)
-; RV32I-NEXT: sb zero, 55(sp)
-; RV32I-NEXT: sb zero, 54(sp)
-; RV32I-NEXT: sb zero, 53(sp)
-; RV32I-NEXT: sb zero, 52(sp)
-; RV32I-NEXT: sb zero, 51(sp)
-; RV32I-NEXT: sb zero, 50(sp)
-; RV32I-NEXT: sb zero, 49(sp)
-; RV32I-NEXT: sb zero, 48(sp)
-; RV32I-NEXT: sb zero, 47(sp)
-; RV32I-NEXT: sb zero, 46(sp)
-; RV32I-NEXT: sb zero, 45(sp)
-; RV32I-NEXT: sb zero, 44(sp)
-; RV32I-NEXT: sb zero, 43(sp)
-; RV32I-NEXT: sb zero, 42(sp)
-; RV32I-NEXT: sb zero, 41(sp)
-; RV32I-NEXT: sb zero, 40(sp)
-; RV32I-NEXT: sb zero, 39(sp)
-; RV32I-NEXT: sb zero, 38(sp)
-; RV32I-NEXT: sb zero, 37(sp)
-; RV32I-NEXT: sb zero, 36(sp)
-; RV32I-NEXT: sb zero, 35(sp)
-; RV32I-NEXT: sb zero, 34(sp)
-; RV32I-NEXT: sb zero, 33(sp)
-; RV32I-NEXT: sb zero, 32(sp)
-; RV32I-NEXT: sb zero, 31(sp)
-; RV32I-NEXT: sb zero, 30(sp)
-; RV32I-NEXT: sb zero, 29(sp)
-; RV32I-NEXT: sb zero, 28(sp)
-; RV32I-NEXT: sb s3, 75(sp)
-; RV32I-NEXT: sb s2, 74(sp)
-; RV32I-NEXT: sb s1, 73(sp)
-; RV32I-NEXT: sb s0, 72(sp)
-; RV32I-NEXT: sb t6, 71(sp)
-; RV32I-NEXT: sb t5, 70(sp)
-; RV32I-NEXT: sb t4, 69(sp)
-; RV32I-NEXT: sb t3, 68(sp)
-; RV32I-NEXT: sb t2, 67(sp)
-; RV32I-NEXT: sb t1, 66(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 63(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 62(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: slli a0, t0, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 60
-; RV32I-NEXT: sub a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a1, t0, 7
-; RV32I-NEXT: lbu a0, 1(a4)
-; RV32I-NEXT: lbu a3, 0(a4)
-; RV32I-NEXT: lbu a5, 2(a4)
-; RV32I-NEXT: lbu a6, 3(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a3
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
+; RV32I-NEXT: slli a3, a3, 8
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a3, a6, a5
-; RV32I-NEXT: or a6, a3, a0
-; RV32I-NEXT: srli a0, a6, 1
-; RV32I-NEXT: xori a7, a1, 31
-; RV32I-NEXT: srl a0, a0, a7
-; RV32I-NEXT: lbu a3, 13(a4)
-; RV32I-NEXT: lbu a5, 12(a4)
-; RV32I-NEXT: lbu t0, 14(a4)
-; RV32I-NEXT: lbu t1, 15(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
+; RV32I-NEXT: slli a7, a7, 16
+; RV32I-NEXT: slli t0, t0, 24
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
; RV32I-NEXT: slli t0, t0, 16
; RV32I-NEXT: slli t1, t1, 24
-; RV32I-NEXT: or a5, t1, t0
-; RV32I-NEXT: or t0, a5, a3
-; RV32I-NEXT: lbu a3, 9(a4)
-; RV32I-NEXT: lbu a5, 8(a4)
-; RV32I-NEXT: lbu t1, 10(a4)
-; RV32I-NEXT: lbu t2, 11(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, a5
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
; RV32I-NEXT: slli t1, t1, 16
; RV32I-NEXT: slli t2, t2, 24
-; RV32I-NEXT: or a5, t2, t1
-; RV32I-NEXT: or t1, a5, a3
-; RV32I-NEXT: srli a3, t1, 1
-; RV32I-NEXT: srl a5, a3, a7
-; RV32I-NEXT: srli t4, t5, 1
-; RV32I-NEXT: not t2, a1
-; RV32I-NEXT: lbu a3, 21(a4)
-; RV32I-NEXT: lbu t3, 20(a4)
-; RV32I-NEXT: lbu t6, 22(a4)
-; RV32I-NEXT: lbu s0, 23(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t3
-; RV32I-NEXT: slli t6, t6, 16
-; RV32I-NEXT: slli s0, s0, 24
-; RV32I-NEXT: or t3, s0, t6
-; RV32I-NEXT: or t3, t3, a3
-; RV32I-NEXT: lbu a3, 17(a4)
-; RV32I-NEXT: lbu t6, 16(a4)
-; RV32I-NEXT: lbu s0, 18(a4)
-; RV32I-NEXT: lbu s1, 19(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a3
-; RV32I-NEXT: lbu a3, 29(a4)
-; RV32I-NEXT: lbu t6, 28(a4)
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu s2, 31(a4)
-; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: or a3, a3, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: lbu s1, 25(a4)
-; RV32I-NEXT: lbu s2, 24(a4)
-; RV32I-NEXT: srl t4, t4, t2
-; RV32I-NEXT: or t6, t6, a3
-; RV32I-NEXT: slli s1, s1, 8
-; RV32I-NEXT: or a3, s1, s2
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu a4, 27(a4)
-; RV32I-NEXT: srli s2, s0, 1
-; RV32I-NEXT: srl s2, s2, a7
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: srli s1, t0, 1
-; RV32I-NEXT: srl s1, s1, t2
-; RV32I-NEXT: or a4, a4, a3
-; RV32I-NEXT: srli a3, a4, 1
-; RV32I-NEXT: srl a7, a3, a7
-; RV32I-NEXT: srli a3, t3, 1
-; RV32I-NEXT: srl t2, a3, t2
-; RV32I-NEXT: sll a3, t5, a1
-; RV32I-NEXT: sll t0, t0, a1
-; RV32I-NEXT: sll t1, t1, a1
-; RV32I-NEXT: sll t3, t3, a1
-; RV32I-NEXT: sll t5, s0, a1
-; RV32I-NEXT: sll t6, t6, a1
-; RV32I-NEXT: sll a4, a4, a1
-; RV32I-NEXT: sll a1, a6, a1
-; RV32I-NEXT: srli a6, a4, 24
-; RV32I-NEXT: sb a6, 27(a2)
-; RV32I-NEXT: srli a6, a4, 16
-; RV32I-NEXT: sb a6, 26(a2)
-; RV32I-NEXT: or a6, a4, t2
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
+; RV32I-NEXT: slli t3, t3, 16
+; RV32I-NEXT: slli t4, t4, 24
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or a0, a0, t4
+; RV32I-NEXT: or a0, a0, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t3, 0(a1)
+; RV32I-NEXT: lbu t4, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
+; RV32I-NEXT: slli t4, t4, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t4
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: sw zero, 28(sp)
+; RV32I-NEXT: sw zero, 24(sp)
+; RV32I-NEXT: sw zero, 20(sp)
+; RV32I-NEXT: sw zero, 16(sp)
+; RV32I-NEXT: sw zero, 12(sp)
+; RV32I-NEXT: sw zero, 8(sp)
+; RV32I-NEXT: sw zero, 4(sp)
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw t2, 56(sp)
+; RV32I-NEXT: sw t1, 52(sp)
+; RV32I-NEXT: sw t0, 48(sp)
+; RV32I-NEXT: sw a6, 44(sp)
+; RV32I-NEXT: sw a5, 40(sp)
+; RV32I-NEXT: sw a4, 36(sp)
+; RV32I-NEXT: sw a3, 32(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: addi a1, sp, 32
+; RV32I-NEXT: sub a4, a1, a0
+; RV32I-NEXT: lw a3, 4(a4)
+; RV32I-NEXT: lw a5, 0(a4)
+; RV32I-NEXT: sll a0, a3, a7
+; RV32I-NEXT: andi a1, a7, 31
+; RV32I-NEXT: xori a6, a1, 31
+; RV32I-NEXT: srli a1, a5, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 8(a4)
+; RV32I-NEXT: srl a1, a1, a6
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: sll a1, t0, a7
+; RV32I-NEXT: srli t2, t1, 1
+; RV32I-NEXT: srl t2, t2, a6
+; RV32I-NEXT: or a1, a1, t2
+; RV32I-NEXT: sll t1, t1, a7
+; RV32I-NEXT: srli a3, a3, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 16(a4)
+; RV32I-NEXT: srl a3, a3, a6
+; RV32I-NEXT: or a3, t1, a3
+; RV32I-NEXT: sll t1, t2, a7
+; RV32I-NEXT: srli t4, t3, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t1, t1, t4
+; RV32I-NEXT: sll t3, t3, a7
+; RV32I-NEXT: srli t0, t0, 1
+; RV32I-NEXT: lw t4, 28(a4)
+; RV32I-NEXT: lw a4, 24(a4)
+; RV32I-NEXT: srl t0, t0, a6
+; RV32I-NEXT: or t0, t3, t0
+; RV32I-NEXT: sll t3, t4, a7
+; RV32I-NEXT: srli t4, a4, 1
+; RV32I-NEXT: srl t4, t4, a6
+; RV32I-NEXT: or t3, t3, t4
+; RV32I-NEXT: sll a4, a4, a7
+; RV32I-NEXT: srli t2, t2, 1
+; RV32I-NEXT: srl a6, t2, a6
+; RV32I-NEXT: or a4, a4, a6
+; RV32I-NEXT: sll a5, a5, a7
+; RV32I-NEXT: sb a5, 0(a2)
+; RV32I-NEXT: srli a6, a5, 24
+; RV32I-NEXT: sb a6, 3(a2)
+; RV32I-NEXT: srli a6, a5, 16
+; RV32I-NEXT: sb a6, 2(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 1(a2)
+; RV32I-NEXT: sb a4, 24(a2)
+; RV32I-NEXT: sb t3, 28(a2)
+; RV32I-NEXT: sb t0, 16(a2)
+; RV32I-NEXT: sb t1, 20(a2)
+; RV32I-NEXT: sb a3, 8(a2)
+; RV32I-NEXT: sb a1, 12(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a5, a4, 24
+; RV32I-NEXT: sb a5, 27(a2)
+; RV32I-NEXT: srli a5, a4, 16
+; RV32I-NEXT: sb a5, 26(a2)
; RV32I-NEXT: srli a4, a4, 8
; RV32I-NEXT: sb a4, 25(a2)
-; RV32I-NEXT: srli a4, t6, 24
+; RV32I-NEXT: srli a4, t3, 24
; RV32I-NEXT: sb a4, 31(a2)
-; RV32I-NEXT: srli a4, t6, 16
+; RV32I-NEXT: srli a4, t3, 16
; RV32I-NEXT: sb a4, 30(a2)
-; RV32I-NEXT: or a4, t6, a7
-; RV32I-NEXT: srli a7, t6, 8
-; RV32I-NEXT: sb a7, 29(a2)
-; RV32I-NEXT: srli a7, t5, 24
-; RV32I-NEXT: sb a7, 19(a2)
-; RV32I-NEXT: srli a7, t5, 16
-; RV32I-NEXT: sb a7, 18(a2)
-; RV32I-NEXT: or a7, t5, s1
-; RV32I-NEXT: srli t2, t5, 8
-; RV32I-NEXT: sb t2, 17(a2)
-; RV32I-NEXT: srli t2, t3, 24
-; RV32I-NEXT: sb t2, 23(a2)
-; RV32I-NEXT: srli t2, t3, 16
-; RV32I-NEXT: sb t2, 22(a2)
-; RV32I-NEXT: or t2, t3, s2
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, t1, 24
-; RV32I-NEXT: sb t3, 11(a2)
-; RV32I-NEXT: srli t3, t1, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, t1, t4
-; RV32I-NEXT: srli t1, t1, 8
-; RV32I-NEXT: sb t1, 9(a2)
-; RV32I-NEXT: srli t1, t0, 24
-; RV32I-NEXT: sb t1, 15(a2)
-; RV32I-NEXT: srli t1, t0, 16
-; RV32I-NEXT: sb t1, 14(a2)
-; RV32I-NEXT: or a5, t0, a5
-; RV32I-NEXT: srli t0, t0, 8
-; RV32I-NEXT: sb t0, 13(a2)
-; RV32I-NEXT: srli t0, a1, 24
-; RV32I-NEXT: sb t0, 3(a2)
-; RV32I-NEXT: srli t0, a1, 16
-; RV32I-NEXT: sb t0, 2(a2)
-; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: srli a4, t3, 8
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 9(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 15(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 14(a2)
; RV32I-NEXT: srli a1, a1, 8
-; RV32I-NEXT: sb a1, 1(a2)
-; RV32I-NEXT: srli a1, a3, 24
+; RV32I-NEXT: sb a1, 13(a2)
+; RV32I-NEXT: srli a1, a0, 24
; RV32I-NEXT: sb a1, 7(a2)
-; RV32I-NEXT: srli a1, a3, 16
+; RV32I-NEXT: srli a1, a0, 16
; RV32I-NEXT: sb a1, 6(a2)
-; RV32I-NEXT: or a0, a3, a0
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 5(a2)
-; RV32I-NEXT: sb a6, 24(a2)
-; RV32I-NEXT: sb a4, 28(a2)
-; RV32I-NEXT: sb a7, 16(a2)
-; RV32I-NEXT: sb t2, 20(a2)
-; RV32I-NEXT: sb t3, 8(a2)
-; RV32I-NEXT: sb a5, 12(a2)
-; RV32I-NEXT: sb a0, 4(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-LABEL: ashr_32bytes:
; RV64I: # %bb.0:
-; RV64I-NEXT: addi sp, sp, -224
-; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t1, 31(a0)
-; RV64I-NEXT: lbu a3, 0(a0)
-; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: addi sp, sp, -64
; RV64I-NEXT: lbu a3, 1(a0)
-; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 2(a0)
-; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 3(a0)
-; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 4(a0)
-; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu a3, 5(a0)
-; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT: lbu t3, 6(a0)
-; RV64I-NEXT: lbu t4, 7(a0)
-; RV64I-NEXT: lbu t5, 8(a0)
-; RV64I-NEXT: lbu t6, 9(a0)
-; RV64I-NEXT: lbu s0, 10(a0)
-; RV64I-NEXT: lbu s1, 11(a0)
-; RV64I-NEXT: lbu s2, 12(a0)
-; RV64I-NEXT: lbu s3, 13(a0)
-; RV64I-NEXT: lbu s4, 14(a0)
-; RV64I-NEXT: lbu s5, 15(a0)
-; RV64I-NEXT: lbu s6, 16(a0)
-; RV64I-NEXT: lbu s7, 17(a0)
-; RV64I-NEXT: lbu s8, 18(a0)
-; RV64I-NEXT: lbu s9, 19(a0)
-; RV64I-NEXT: lbu a3, 1(a1)
-; RV64I-NEXT: lbu s10, 0(a1)
-; RV64I-NEXT: lbu s11, 2(a1)
-; RV64I-NEXT: lbu ra, 3(a1)
-; RV64I-NEXT: slli a3, a3, 8
-; RV64I-NEXT: or a3, a3, s10
-; RV64I-NEXT: slli s11, s11, 16
-; RV64I-NEXT: slli ra, ra, 24
-; RV64I-NEXT: lbu s10, 5(a1)
-; RV64I-NEXT: or s11, ra, s11
-; RV64I-NEXT: or a3, s11, a3
-; RV64I-NEXT: lbu s11, 4(a1)
-; RV64I-NEXT: slli s10, s10, 8
-; RV64I-NEXT: lbu ra, 6(a1)
-; RV64I-NEXT: lbu a1, 7(a1)
-; RV64I-NEXT: or s10, s10, s11
-; RV64I-NEXT: lbu s11, 20(a0)
-; RV64I-NEXT: slli ra, ra, 16
-; RV64I-NEXT: slli a1, a1, 24
-; RV64I-NEXT: or a1, a1, ra
-; RV64I-NEXT: lbu ra, 21(a0)
-; RV64I-NEXT: or a1, a1, s10
-; RV64I-NEXT: lbu s10, 22(a0)
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or t2, a1, a3
-; RV64I-NEXT: lbu t0, 23(a0)
-; RV64I-NEXT: lbu a7, 24(a0)
-; RV64I-NEXT: lbu a6, 25(a0)
-; RV64I-NEXT: lbu a5, 26(a0)
-; RV64I-NEXT: lbu a1, 30(a0)
-; RV64I-NEXT: lbu a3, 29(a0)
-; RV64I-NEXT: lbu a4, 28(a0)
-; RV64I-NEXT: lbu a0, 27(a0)
-; RV64I-NEXT: sb a1, 86(sp)
-; RV64I-NEXT: sb a3, 85(sp)
-; RV64I-NEXT: sb a4, 84(sp)
-; RV64I-NEXT: sb a0, 83(sp)
-; RV64I-NEXT: sb a5, 82(sp)
-; RV64I-NEXT: sb a6, 81(sp)
-; RV64I-NEXT: sb a7, 80(sp)
-; RV64I-NEXT: sb t0, 79(sp)
-; RV64I-NEXT: sb s10, 78(sp)
-; RV64I-NEXT: sb ra, 77(sp)
-; RV64I-NEXT: sb s11, 76(sp)
-; RV64I-NEXT: sb s9, 75(sp)
-; RV64I-NEXT: sb s8, 74(sp)
-; RV64I-NEXT: sb s7, 73(sp)
-; RV64I-NEXT: sb s6, 72(sp)
-; RV64I-NEXT: sb s5, 71(sp)
-; RV64I-NEXT: sb s4, 70(sp)
-; RV64I-NEXT: sb s3, 69(sp)
-; RV64I-NEXT: sb s2, 68(sp)
-; RV64I-NEXT: sb s1, 67(sp)
-; RV64I-NEXT: sb s0, 66(sp)
-; RV64I-NEXT: sb t6, 65(sp)
-; RV64I-NEXT: sb t5, 64(sp)
-; RV64I-NEXT: sb t1, 87(sp)
-; RV64I-NEXT: slli t1, t1, 56
-; RV64I-NEXT: sb t4, 63(sp)
-; RV64I-NEXT: sb t3, 62(sp)
-; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 61(sp)
-; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 60(sp)
-; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 59(sp)
-; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 58(sp)
-; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 57(sp)
-; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload
-; RV64I-NEXT: sb a0, 56(sp)
-; RV64I-NEXT: srai a0, t1, 63
-; RV64I-NEXT: sb a0, 112(sp)
-; RV64I-NEXT: sb a0, 104(sp)
-; RV64I-NEXT: sb a0, 96(sp)
-; RV64I-NEXT: sb a0, 88(sp)
-; RV64I-NEXT: srli a1, a0, 56
-; RV64I-NEXT: sb a1, 119(sp)
-; RV64I-NEXT: srli a3, a0, 48
-; RV64I-NEXT: sb a3, 118(sp)
-; RV64I-NEXT: srli a4, a0, 40
-; RV64I-NEXT: sb a4, 117(sp)
-; RV64I-NEXT: srli a5, a0, 32
-; RV64I-NEXT: sb a5, 116(sp)
-; RV64I-NEXT: srli a6, a0, 24
-; RV64I-NEXT: sb a6, 115(sp)
-; RV64I-NEXT: srli a7, a0, 16
-; RV64I-NEXT: sb a7, 114(sp)
-; RV64I-NEXT: srli a0, a0, 8
-; RV64I-NEXT: sb a0, 113(sp)
-; RV64I-NEXT: sb a1, 111(sp)
-; RV64I-NEXT: sb a3, 110(sp)
-; RV64I-NEXT: sb a4, 109(sp)
-; RV64I-NEXT: sb a5, 108(sp)
-; RV64I-NEXT: sb a6, 107(sp)
-; RV64I-NEXT: sb a7, 106(sp)
-; RV64I-NEXT: sb a0, 105(sp)
-; RV64I-NEXT: sb a1, 103(sp)
-; RV64I-NEXT: sb a3, 102(sp)
-; RV64I-NEXT: sb a4, 101(sp)
-; RV64I-NEXT: sb a5, 100(sp)
-; RV64I-NEXT: sb a6, 99(sp)
-; RV64I-NEXT: sb a7, 98(sp)
-; RV64I-NEXT: sb a0, 97(sp)
-; RV64I-NEXT: sb a1, 95(sp)
-; RV64I-NEXT: sb a3, 94(sp)
-; RV64I-NEXT: sb a4, 93(sp)
-; RV64I-NEXT: sb a5, 92(sp)
-; RV64I-NEXT: sb a6, 91(sp)
-; RV64I-NEXT: sb a7, 90(sp)
-; RV64I-NEXT: sb a0, 89(sp)
-; RV64I-NEXT: slli a0, t2, 56
-; RV64I-NEXT: srli a0, a0, 59
-; RV64I-NEXT: addi a1, sp, 56
-; RV64I-NEXT: add a1, a1, a0
-; RV64I-NEXT: lbu a0, 9(a1)
-; RV64I-NEXT: lbu a3, 8(a1)
-; RV64I-NEXT: lbu a4, 10(a1)
-; RV64I-NEXT: lbu a5, 11(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a3
-; RV64I-NEXT: slli a4, a4, 16
-; RV64I-NEXT: slli a5, a5, 24
-; RV64I-NEXT: or a4, a5, a4
-; RV64I-NEXT: or a0, a4, a0
-; RV64I-NEXT: lbu a3, 13(a1)
-; RV64I-NEXT: lbu a4, 12(a1)
-; RV64I-NEXT: lbu a5, 14(a1)
-; RV64I-NEXT: lbu a6, 15(a1)
+; RV64I-NEXT: lbu a4, 0(a0)
+; RV64I-NEXT: lbu a5, 2(a0)
+; RV64I-NEXT: lbu a6, 3(a0)
; RV64I-NEXT: slli a3, a3, 8
; RV64I-NEXT: or a3, a3, a4
; RV64I-NEXT: slli a5, a5, 16
; RV64I-NEXT: slli a6, a6, 24
; RV64I-NEXT: or a4, a6, a5
; RV64I-NEXT: or a3, a4, a3
-; RV64I-NEXT: slli a3, a3, 32
-; RV64I-NEXT: or a4, a3, a0
-; RV64I-NEXT: andi a3, t2, 7
-; RV64I-NEXT: lbu a0, 17(a1)
-; RV64I-NEXT: lbu a5, 16(a1)
-; RV64I-NEXT: lbu a6, 18(a1)
-; RV64I-NEXT: lbu a7, 19(a1)
-; RV64I-NEXT: slli a0, a0, 8
-; RV64I-NEXT: or a0, a0, a5
+; RV64I-NEXT: lbu a4, 5(a0)
+; RV64I-NEXT: lbu a5, 4(a0)
+; RV64I-NEXT: lbu a6, 6(a0)
+; RV64I-NEXT: lbu a7, 7(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
; RV64I-NEXT: slli a6, a6, 16
; RV64I-NEXT: slli a7, a7, 24
; RV64I-NEXT: or a5, a7, a6
-; RV64I-NEXT: or a0, a5, a0
-; RV64I-NEXT: lbu a5, 21(a1)
-; RV64I-NEXT: lbu a6, 20(a1)
-; RV64I-NEXT: lbu a7, 22(a1)
-; RV64I-NEXT: lbu t0, 23(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: slli a4, a4, 32
+; RV64I-NEXT: or a3, a4, a3
+; RV64I-NEXT: lbu a4, 9(a0)
+; RV64I-NEXT: lbu a5, 8(a0)
+; RV64I-NEXT: lbu a6, 10(a0)
+; RV64I-NEXT: lbu a7, 11(a0)
+; RV64I-NEXT: slli a4, a4, 8
+; RV64I-NEXT: or a4, a4, a5
+; RV64I-NEXT: slli a6, a6, 16
+; RV64I-NEXT: slli a7, a7, 24
+; RV64I-NEXT: or a5, a7, a6
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 13(a0)
+; RV64I-NEXT: lbu a6, 12(a0)
+; RV64I-NEXT: lbu a7, 14(a0)
+; RV64I-NEXT: lbu t0, 15(a0)
; RV64I-NEXT: slli a5, a5, 8
; RV64I-NEXT: or a5, a5, a6
; RV64I-NEXT: slli a7, a7, 16
@@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: or a6, t0, a7
; RV64I-NEXT: or a5, a6, a5
; RV64I-NEXT: slli a5, a5, 32
-; RV64I-NEXT: or a5, a5, a0
-; RV64I-NEXT: slli a0, a5, 1
-; RV64I-NEXT: not a6, a3
-; RV64I-NEXT: sll a0, a0, a6
-; RV64I-NEXT: lbu a6, 1(a1)
-; RV64I-NEXT: lbu a7, 0(a1)
-; RV64I-NEXT: lbu t0, 2(a1)
-; RV64I-NEXT: lbu t1, 3(a1)
+; RV64I-NEXT: or a4, a5, a4
+; RV64I-NEXT: lbu a5, 17(a0)
+; RV64I-NEXT: lbu a6, 16(a0)
+; RV64I-NEXT: lbu a7, 18(a0)
+; RV64I-NEXT: lbu t0, 19(a0)
+; RV64I-NEXT: slli a5, a5, 8
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: slli a7, a7, 16
+; RV64I-NEXT: slli t0, t0, 24
+; RV64I-NEXT: or a6, t0, a7
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 21(a0)
+; RV64I-NEXT: lbu a7, 20(a0)
+; RV64I-NEXT: lbu t0, 22(a0)
+; RV64I-NEXT: lbu t1, 23(a0)
; RV64I-NEXT: slli a6, a6, 8
; RV64I-NEXT: or a6, a6, a7
; RV64I-NEXT: slli t0, t0, 16
; RV64I-NEXT: slli t1, t1, 24
; RV64I-NEXT: or a7, t1, t0
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 5(a1)
-; RV64I-NEXT: lbu t0, 4(a1)
-; RV64I-NEXT: lbu t1, 6(a1)
-; RV64I-NEXT: lbu t2, 7(a1)
+; RV64I-NEXT: slli a6, a6, 32
+; RV64I-NEXT: or a5, a6, a5
+; RV64I-NEXT: lbu a6, 25(a0)
+; RV64I-NEXT: lbu a7, 24(a0)
+; RV64I-NEXT: lbu t0, 26(a0)
+; RV64I-NEXT: lbu t1, 27(a0)
+; RV64I-NEXT: slli a6, a6, 8
+; RV64I-NEXT: or a6, a6, a7
+; RV64I-NEXT: slli t0, t0, 16
+; RV64I-NEXT: slli t1, t1, 24
+; RV64I-NEXT: or a7, t1, t0
+; RV64I-NEXT: or a6, a7, a6
+; RV64I-NEXT: lbu a7, 29(a0)
+; RV64I-NEXT: lbu t0, 28(a0)
+; RV64I-NEXT: lbu t1, 30(a0)
+; RV64I-NEXT: lbu a0, 31(a0)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
-; RV64I-NEXT: slli t2, t2, 24
-; RV64I-NEXT: or t0, t2, t1
-; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: slli a7, a7, 32
+; RV64I-NEXT: slli a0, a0, 24
+; RV64I-NEXT: or a0, a0, t1
+; RV64I-NEXT: or a0, a0, a7
+; RV64I-NEXT: slli a7, a0, 32
; RV64I-NEXT: or a6, a7, a6
-; RV64I-NEXT: lbu a7, 25(a1)
-; RV64I-NEXT: lbu t0, 24(a1)
-; RV64I-NEXT: lbu t1, 26(a1)
-; RV64I-NEXT: lbu t2, 27(a1)
+; RV64I-NEXT: lbu a7, 1(a1)
+; RV64I-NEXT: lbu t0, 0(a1)
+; RV64I-NEXT: lbu t1, 2(a1)
+; RV64I-NEXT: lbu t2, 3(a1)
; RV64I-NEXT: slli a7, a7, 8
; RV64I-NEXT: or a7, a7, t0
; RV64I-NEXT: slli t1, t1, 16
; RV64I-NEXT: slli t2, t2, 24
; RV64I-NEXT: or t0, t2, t1
; RV64I-NEXT: or a7, t0, a7
-; RV64I-NEXT: lbu t0, 29(a1)
-; RV64I-NEXT: lbu t1, 28(a1)
-; RV64I-NEXT: lbu t2, 30(a1)
-; RV64I-NEXT: lbu a1, 31(a1)
+; RV64I-NEXT: lbu t0, 5(a1)
+; RV64I-NEXT: lbu t1, 4(a1)
+; RV64I-NEXT: lbu t2, 6(a1)
+; RV64I-NEXT: lbu a1, 7(a1)
; RV64I-NEXT: slli t0, t0, 8
; RV64I-NEXT: or t0, t0, t1
; RV64I-NEXT: slli t2, t2, 16
; RV64I-NEXT: slli a1, a1, 24
; RV64I-NEXT: or a1, a1, t2
-; RV64I-NEXT: slli t1, a4, 1
; RV64I-NEXT: or a1, a1, t0
-; RV64I-NEXT: xori t0, a3, 63
-; RV64I-NEXT: sll t1, t1, t0
; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: or a7, a1, a7
-; RV64I-NEXT: slli a1, a7, 1
-; RV64I-NEXT: sll t0, a1, t0
-; RV64I-NEXT: srl a1, a4, a3
-; RV64I-NEXT: srl a4, a6, a3
-; RV64I-NEXT: srl a5, a5, a3
-; RV64I-NEXT: sra a3, a7, a3
-; RV64I-NEXT: srli a6, a5, 48
-; RV64I-NEXT: sb a6, 22(a2)
-; RV64I-NEXT: srli a6, a5, 40
-; RV64I-NEXT: sb a6, 21(a2)
-; RV64I-NEXT: srli a6, a5, 32
-; RV64I-NEXT: sb a6, 20(a2)
-; RV64I-NEXT: srli a6, a5, 24
-; RV64I-NEXT: sb a6, 19(a2)
-; RV64I-NEXT: srli a6, a5, 16
-; RV64I-NEXT: sb a6, 18(a2)
-; RV64I-NEXT: or a6, a5, t0
+; RV64I-NEXT: or a1, a1, a7
+; RV64I-NEXT: sraiw a0, a0, 31
+; RV64I-NEXT: sd a0, 56(sp)
+; RV64I-NEXT: sd a0, 48(sp)
+; RV64I-NEXT: sd a0, 40(sp)
+; RV64I-NEXT: sd a0, 32(sp)
+; RV64I-NEXT: sd a6, 24(sp)
+; RV64I-NEXT: sd a5, 16(sp)
+; RV64I-NEXT: sd a4, 8(sp)
+; RV64I-NEXT: sd a3, 0(sp)
+; RV64I-NEXT: srli a0, a1, 3
+; RV64I-NEXT: andi a0, a0, 24
+; RV64I-NEXT: mv a3, sp
+; RV64I-NEXT: add a3, a3, a0
+; RV64I-NEXT: ld a4, 8(a3)
+; RV64I-NEXT: srl a0, a4, a1
+; RV64I-NEXT: ld a5, 16(a3)
+; RV64I-NEXT: andi a6, a1, 63
+; RV64I-NEXT: xori a6, a6, 63
+; RV64I-NEXT: ld a7, 0(a3)
+; RV64I-NEXT: slli t0, a5, 1
+; RV64I-NEXT: sll t0, t0, a6
+; RV64I-NEXT: or a0, a0, t0
+; RV64I-NEXT: srl a7, a7, a1
+; RV64I-NEXT: slli a4, a4, 1
+; RV64I-NEXT: ld a3, 24(a3)
+; RV64I-NEXT: sll a4, a4, a6
+; RV64I-NEXT: or a4, a7, a4
+; RV64I-NEXT: srl a5, a5, a1
+; RV64I-NEXT: slli a7, a3, 1
+; RV64I-NEXT: sll a6, a7, a6
+; RV64I-NEXT: or a5, a5, a6
+; RV64I-NEXT: sra a1, a3, a1
+; RV64I-NEXT: sb a1, 24(a2)
+; RV64I-NEXT: srli a3, a1, 56
+; RV64I-NEXT: sb a3, 31(a2)
+; RV64I-NEXT: srli a3, a1, 48
+; RV64I-NEXT: sb a3, 30(a2)
+; RV64I-NEXT: srli a3, a1, 40
+; RV64I-NEXT: sb a3, 29(a2)
+; RV64I-NEXT: srli a3, a1, 32
+; RV64I-NEXT: sb a3, 28(a2)
+; RV64I-NEXT: srli a3, a1, 24
+; RV64I-NEXT: sb a3, 27(a2)
+; RV64I-NEXT: srli a3, a1, 16
+; RV64I-NEXT: sb a3, 26(a2)
+; RV64I-NEXT: srli a1, a1, 8
+; RV64I-NEXT: sb a1, 25(a2)
; RV64I-NEXT: sb a5, 16(a2)
+; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: sb a0, 8(a2)
+; RV64I-NEXT: srli a1, a5, 56
+; RV64I-NEXT: sb a1, 23(a2)
+; RV64I-NEXT: srli a1, a5, 48
+; RV64I-NEXT: sb a1, 22(a2)
+; RV64I-NEXT: srli a1, a5, 40
+; RV64I-NEXT: sb a1, 21(a2)
+; RV64I-NEXT: srli a1, a5, 32
+; RV64I-NEXT: sb a1, 20(a2)
+; RV64I-NEXT: srli a1, a5, 24
+; RV64I-NEXT: sb a1, 19(a2)
+; RV64I-NEXT: srli a1, a5, 16
+; RV64I-NEXT: sb a1, 18(a2)
; RV64I-NEXT: srli a5, a5, 8
; RV64I-NEXT: sb a5, 17(a2)
-; RV64I-NEXT: srli a5, a3, 56
-; RV64I-NEXT: sb a5, 31(a2)
-; RV64I-NEXT: srli a5, a3, 48
-; RV64I-NEXT: sb a5, 30(a2)
-; RV64I-NEXT: srli a5, a3, 40
-; RV64I-NEXT: sb a5, 29(a2)
-; RV64I-NEXT: srli a5, a3, 32
-; RV64I-NEXT: sb a5, 28(a2)
-; RV64I-NEXT: srli a5, a3, 24
-; RV64I-NEXT: sb a5, 27(a2)
-; RV64I-NEXT: srli a5, a3, 16
-; RV64I-NEXT: sb a5, 26(a2)
-; RV64I-NEXT: sb a3, 24(a2)
-; RV64I-NEXT: srli a3, a3, 8
-; RV64I-NEXT: sb a3, 25(a2)
-; RV64I-NEXT: srli a3, a4, 48
-; RV64I-NEXT: sb a3, 6(a2)
-; RV64I-NEXT: srli a3, a4, 40
-; RV64I-NEXT: sb a3, 5(a2)
-; RV64I-NEXT: srli a3, a4, 32
-; RV64I-NEXT: sb a3, 4(a2)
-; RV64I-NEXT: srli a3, a4, 24
-; RV64I-NEXT: sb a3, 3(a2)
-; RV64I-NEXT: srli a3, a4, 16
-; RV64I-NEXT: sb a3, 2(a2)
-; RV64I-NEXT: or a3, a4, t1
-; RV64I-NEXT: sb a4, 0(a2)
+; RV64I-NEXT: srli a1, a4, 56
+; RV64I-NEXT: sb a1, 7(a2)
+; RV64I-NEXT: srli a1, a4, 48
+; RV64I-NEXT: sb a1, 6(a2)
+; RV64I-NEXT: srli a1, a4, 40
+; RV64I-NEXT: sb a1, 5(a2)
+; RV64I-NEXT: srli a1, a4, 32
+; RV64I-NEXT: sb a1, 4(a2)
+; RV64I-NEXT: srli a1, a4, 24
+; RV64I-NEXT: sb a1, 3(a2)
+; RV64I-NEXT: srli a1, a4, 16
+; RV64I-NEXT: sb a1, 2(a2)
; RV64I-NEXT: srli a4, a4, 8
; RV64I-NEXT: sb a4, 1(a2)
-; RV64I-NEXT: srli a4, a1, 48
-; RV64I-NEXT: sb a4, 14(a2)
-; RV64I-NEXT: srli a4, a1, 40
-; RV64I-NEXT: sb a4, 13(a2)
-; RV64I-NEXT: srli a4, a1, 32
-; RV64I-NEXT: sb a4, 12(a2)
-; RV64I-NEXT: srli a4, a1, 24
-; RV64I-NEXT: sb a4, 11(a2)
-; RV64I-NEXT: srli a4, a1, 16
-; RV64I-NEXT: sb a4, 10(a2)
-; RV64I-NEXT: or a0, a1, a0
-; RV64I-NEXT: sb a1, 8(a2)
-; RV64I-NEXT: srli a1, a1, 8
-; RV64I-NEXT: sb a1, 9(a2)
-; RV64I-NEXT: srli a1, a6, 56
-; RV64I-NEXT: sb a1, 23(a2)
-; RV64I-NEXT: srli a3, a3, 56
-; RV64I-NEXT: sb a3, 7(a2)
-; RV64I-NEXT: srli a0, a0, 56
-; RV64I-NEXT: sb a0, 15(a2)
-; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT: addi sp, sp, 224
+; RV64I-NEXT: srli a1, a0, 56
+; RV64I-NEXT: sb a1, 15(a2)
+; RV64I-NEXT: srli a1, a0, 48
+; RV64I-NEXT: sb a1, 14(a2)
+; RV64I-NEXT: srli a1, a0, 40
+; RV64I-NEXT: sb a1, 13(a2)
+; RV64I-NEXT: srli a1, a0, 32
+; RV64I-NEXT: sb a1, 12(a2)
+; RV64I-NEXT: srli a1, a0, 24
+; RV64I-NEXT: sb a1, 11(a2)
+; RV64I-NEXT: srli a1, a0, 16
+; RV64I-NEXT: sb a1, 10(a2)
+; RV64I-NEXT: srli a0, a0, 8
+; RV64I-NEXT: sb a0, 9(a2)
+; RV64I-NEXT: addi sp, sp, 64
; RV64I-NEXT: ret
;
; RV32I-LABEL: ashr_32bytes:
; RV32I: # %bb.0:
-; RV32I-NEXT: addi sp, sp, -144
-; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
-; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t3, 31(a0)
-; RV32I-NEXT: lbu a3, 0(a0)
-; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT: addi sp, sp, -64
; RV32I-NEXT: lbu a3, 1(a0)
-; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 2(a0)
-; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 3(a0)
-; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 4(a0)
-; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu a3, 5(a0)
-; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT: lbu t2, 6(a0)
-; RV32I-NEXT: lbu t4, 7(a0)
-; RV32I-NEXT: lbu t5, 8(a0)
-; RV32I-NEXT: lbu t6, 9(a0)
-; RV32I-NEXT: lbu s0, 10(a0)
-; RV32I-NEXT: lbu s1, 11(a0)
-; RV32I-NEXT: lbu s2, 12(a0)
-; RV32I-NEXT: lbu s3, 13(a0)
-; RV32I-NEXT: lbu s4, 14(a0)
-; RV32I-NEXT: lbu s5, 15(a0)
-; RV32I-NEXT: lbu s6, 16(a0)
-; RV32I-NEXT: lbu s7, 17(a0)
-; RV32I-NEXT: lbu s8, 18(a0)
-; RV32I-NEXT: lbu a3, 1(a1)
-; RV32I-NEXT: lbu s9, 19(a0)
-; RV32I-NEXT: lbu s10, 20(a0)
-; RV32I-NEXT: lbu s11, 0(a1)
+; RV32I-NEXT: lbu a4, 0(a0)
+; RV32I-NEXT: lbu a5, 2(a0)
+; RV32I-NEXT: lbu a6, 3(a0)
; RV32I-NEXT: slli a3, a3, 8
-; RV32I-NEXT: lbu ra, 2(a1)
-; RV32I-NEXT: lbu a1, 3(a1)
-; RV32I-NEXT: or a3, a3, s11
-; RV32I-NEXT: lbu s11, 21(a0)
-; RV32I-NEXT: slli ra, ra, 16
-; RV32I-NEXT: slli a1, a1, 24
-; RV32I-NEXT: or a1, a1, ra
-; RV32I-NEXT: lbu ra, 22(a0)
-; RV32I-NEXT: or t1, a1, a3
-; RV32I-NEXT: lbu t0, 23(a0)
-; RV32I-NEXT: lbu a7, 24(a0)
-; RV32I-NEXT: lbu a6, 25(a0)
-; RV32I-NEXT: lbu a5, 26(a0)
-; RV32I-NEXT: lbu a1, 30(a0)
-; RV32I-NEXT: lbu a3, 29(a0)
-; RV32I-NEXT: lbu a4, 28(a0)
-; RV32I-NEXT: lbu a0, 27(a0)
-; RV32I-NEXT: sb a1, 58(sp)
-; RV32I-NEXT: sb a3, 57(sp)
-; RV32I-NEXT: sb a4, 56(sp)
-; RV32I-NEXT: sb a0, 55(sp)
-; RV32I-NEXT: sb a5, 54(sp)
-; RV32I-NEXT: sb a6, 53(sp)
-; RV32I-NEXT: sb a7, 52(sp)
-; RV32I-NEXT: sb t0, 51(sp)
-; RV32I-NEXT: sb ra, 50(sp)
-; RV32I-NEXT: sb s11, 49(sp)
-; RV32I-NEXT: sb s10, 48(sp)
-; RV32I-NEXT: sb s9, 47(sp)
-; RV32I-NEXT: sb s8, 46(sp)
-; RV32I-NEXT: sb s7, 45(sp)
-; RV32I-NEXT: sb s6, 44(sp)
-; RV32I-NEXT: sb s5, 43(sp)
-; RV32I-NEXT: sb t3, 59(sp)
-; RV32I-NEXT: slli t3, t3, 24
-; RV32I-NEXT: sb s4, 42(sp)
-; RV32I-NEXT: sb s3, 41(sp)
-; RV32I-NEXT: sb s2, 40(sp)
-; RV32I-NEXT: sb s1, 39(sp)
-; RV32I-NEXT: sb s0, 38(sp)
-; RV32I-NEXT: sb t6, 37(sp)
-; RV32I-NEXT: sb t5, 36(sp)
-; RV32I-NEXT: sb t4, 35(sp)
-; RV32I-NEXT: sb t2, 34(sp)
-; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 33(sp)
-; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 32(sp)
-; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 31(sp)
-; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 30(sp)
-; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 29(sp)
-; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload
-; RV32I-NEXT: sb a0, 28(sp)
-; RV32I-NEXT: srai a0, t3, 31
-; RV32I-NEXT: sb a0, 88(sp)
-; RV32I-NEXT: sb a0, 84(sp)
-; RV32I-NEXT: sb a0, 80(sp)
-; RV32I-NEXT: sb a0, 76(sp)
-; RV32I-NEXT: sb a0, 72(sp)
-; RV32I-NEXT: sb a0, 68(sp)
-; RV32I-NEXT: sb a0, 64(sp)
-; RV32I-NEXT: sb a0, 60(sp)
-; RV32I-NEXT: srli a1, a0, 24
-; RV32I-NEXT: sb a1, 91(sp)
-; RV32I-NEXT: srli a3, a0, 16
-; RV32I-NEXT: sb a3, 90(sp)
-; RV32I-NEXT: srli a0, a0, 8
-; RV32I-NEXT: sb a0, 89(sp)
-; RV32I-NEXT: sb a1, 87(sp)
-; RV32I-NEXT: sb a3, 86(sp)
-; RV32I-NEXT: sb a0, 85(sp)
-; RV32I-NEXT: sb a1, 83(sp)
-; RV32I-NEXT: sb a3, 82(sp)
-; RV32I-NEXT: sb a0, 81(sp)
-; RV32I-NEXT: sb a1, 79(sp)
-; RV32I-NEXT: sb a3, 78(sp)
-; RV32I-NEXT: sb a0, 77(sp)
-; RV32I-NEXT: sb a1, 75(sp)
-; RV32I-NEXT: sb a3, 74(sp)
-; RV32I-NEXT: sb a0, 73(sp)
-; RV32I-NEXT: sb a1, 71(sp)
-; RV32I-NEXT: sb a3, 70(sp)
-; RV32I-NEXT: sb a0, 69(sp)
-; RV32I-NEXT: sb a1, 67(sp)
-; RV32I-NEXT: sb a3, 66(sp)
-; RV32I-NEXT: sb a0, 65(sp)
-; RV32I-NEXT: sb a1, 63(sp)
-; RV32I-NEXT: sb a3, 62(sp)
-; RV32I-NEXT: sb a0, 61(sp)
-; RV32I-NEXT: slli a0, t1, 24
-; RV32I-NEXT: srli a0, a0, 27
-; RV32I-NEXT: addi a4, sp, 28
-; RV32I-NEXT: add a4, a4, a0
-; RV32I-NEXT: lbu a0, 5(a4)
-; RV32I-NEXT: lbu a1, 4(a4)
-; RV32I-NEXT: lbu a3, 6(a4)
-; RV32I-NEXT: lbu a5, 7(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
-; RV32I-NEXT: slli a3, a3, 16
-; RV32I-NEXT: slli a5, a5, 24
-; RV32I-NEXT: or a3, a5, a3
-; RV32I-NEXT: or t5, a3, a0
-; RV32I-NEXT: andi a3, t1, 7
-; RV32I-NEXT: lbu a0, 9(a4)
-; RV32I-NEXT: lbu a1, 8(a4)
-; RV32I-NEXT: lbu a5, 10(a4)
-; RV32I-NEXT: lbu a6, 11(a4)
-; RV32I-NEXT: slli a0, a0, 8
-; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: or a3, a3, a4
; RV32I-NEXT: slli a5, a5, 16
; RV32I-NEXT: slli a6, a6, 24
-; RV32I-NEXT: or a1, a6, a5
-; RV32I-NEXT: or a6, a1, a0
-; RV32I-NEXT: slli a0, a6, 1
-; RV32I-NEXT: not t1, a3
-; RV32I-NEXT: sll a0, a0, t1
-; RV32I-NEXT: lbu a1, 1(a4)
-; RV32I-NEXT: lbu a5, 0(a4)
-; RV32I-NEXT: lbu a7, 2(a4)
-; RV32I-NEXT: lbu t0, 3(a4)
-; RV32I-NEXT: slli a1, a1, 8
-; RV32I-NEXT: or a1, a1, a5
+; RV32I-NEXT: or a4, a6, a5
+; RV32I-NEXT: or a3, a4, a3
+; RV32I-NEXT: lbu a4, 5(a0)
+; RV32I-NEXT: lbu a5, 4(a0)
+; RV32I-NEXT: lbu a6, 6(a0)
+; RV32I-NEXT: lbu a7, 7(a0)
+; RV32I-NEXT: slli a4, a4, 8
+; RV32I-NEXT: or a4, a4, a5
+; RV32I-NEXT: slli a6, a6, 16
+; RV32I-NEXT: slli a7, a7, 24
+; RV32I-NEXT: or a5, a7, a6
+; RV32I-NEXT: or a4, a5, a4
+; RV32I-NEXT: lbu a5, 9(a0)
+; RV32I-NEXT: lbu a6, 8(a0)
+; RV32I-NEXT: lbu a7, 10(a0)
+; RV32I-NEXT: lbu t0, 11(a0)
+; RV32I-NEXT: slli a5, a5, 8
+; RV32I-NEXT: or a5, a5, a6
; RV32I-NEXT: slli a7, a7, 16
; RV32I-NEXT: slli t0, t0, 24
-; RV32I-NEXT: or a5, t0, a7
-; RV32I-NEXT: or t0, a5, a1
-; RV32I-NEXT: slli a1, t5, 1
-; RV32I-NEXT: xori t2, a3, 31
-; RV32I-NEXT: sll a1, a1, t2
-; RV32I-NEXT: lbu a5, 13(a4)
-; RV32I-NEXT: lbu a7, 12(a4)
-; RV32I-NEXT: lbu t3, 14(a4)
-; RV32I-NEXT: lbu t4, 15(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or a6, t0, a7
+; RV32I-NEXT: or a5, a6, a5
+; RV32I-NEXT: lbu a6, 13(a0)
+; RV32I-NEXT: lbu a7, 12(a0)
+; RV32I-NEXT: lbu t0, 14(a0)
+; RV32I-NEXT: lbu t1, 15(a0)
+; RV32I-NEXT: slli a6, a6, 8
+; RV32I-NEXT: or a6, a6, a7
+; RV32I-NEXT: slli t0, t0, 16
+; RV32I-NEXT: slli t1, t1, 24
+; RV32I-NEXT: or a7, t1, t0
+; RV32I-NEXT: or a6, a7, a6
+; RV32I-NEXT: lbu a7, 17(a0)
+; RV32I-NEXT: lbu t0, 16(a0)
+; RV32I-NEXT: lbu t1, 18(a0)
+; RV32I-NEXT: lbu t2, 19(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t0
+; RV32I-NEXT: slli t1, t1, 16
+; RV32I-NEXT: slli t2, t2, 24
+; RV32I-NEXT: or t0, t2, t1
+; RV32I-NEXT: or t0, t0, a7
+; RV32I-NEXT: lbu a7, 21(a0)
+; RV32I-NEXT: lbu t1, 20(a0)
+; RV32I-NEXT: lbu t2, 22(a0)
+; RV32I-NEXT: lbu t3, 23(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t1
+; RV32I-NEXT: slli t2, t2, 16
+; RV32I-NEXT: slli t3, t3, 24
+; RV32I-NEXT: or t1, t3, t2
+; RV32I-NEXT: or t1, t1, a7
+; RV32I-NEXT: lbu a7, 25(a0)
+; RV32I-NEXT: lbu t2, 24(a0)
+; RV32I-NEXT: lbu t3, 26(a0)
+; RV32I-NEXT: lbu t4, 27(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t2
; RV32I-NEXT: slli t3, t3, 16
; RV32I-NEXT: slli t4, t4, 24
-; RV32I-NEXT: or a7, t4, t3
-; RV32I-NEXT: or t3, a7, a5
-; RV32I-NEXT: lbu a5, 17(a4)
-; RV32I-NEXT: lbu a7, 16(a4)
-; RV32I-NEXT: lbu t4, 18(a4)
-; RV32I-NEXT: lbu t6, 19(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, a7
+; RV32I-NEXT: or t2, t4, t3
+; RV32I-NEXT: or t2, t2, a7
+; RV32I-NEXT: lbu a7, 29(a0)
+; RV32I-NEXT: lbu t3, 28(a0)
+; RV32I-NEXT: lbu t4, 30(a0)
+; RV32I-NEXT: lbu a0, 31(a0)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t3
; RV32I-NEXT: slli t4, t4, 16
-; RV32I-NEXT: slli t6, t6, 24
-; RV32I-NEXT: or a7, t6, t4
-; RV32I-NEXT: or t4, a7, a5
-; RV32I-NEXT: slli a5, t4, 1
-; RV32I-NEXT: sll a7, a5, t1
-; RV32I-NEXT: lbu a5, 21(a4)
-; RV32I-NEXT: lbu t6, 20(a4)
-; RV32I-NEXT: lbu s0, 22(a4)
-; RV32I-NEXT: lbu s1, 23(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s0, s0, 16
-; RV32I-NEXT: slli s1, s1, 24
-; RV32I-NEXT: or s0, s1, s0
-; RV32I-NEXT: or s0, s0, a5
-; RV32I-NEXT: lbu a5, 25(a4)
-; RV32I-NEXT: lbu t6, 24(a4)
-; RV32I-NEXT: lbu s1, 26(a4)
-; RV32I-NEXT: lbu s2, 27(a4)
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, t6
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli s2, s2, 24
-; RV32I-NEXT: or t6, s2, s1
-; RV32I-NEXT: or t6, t6, a5
-; RV32I-NEXT: lbu a5, 29(a4)
-; RV32I-NEXT: lbu s1, 28(a4)
-; RV32I-NEXT: slli s2, t6, 1
-; RV32I-NEXT: sll t1, s2, t1
-; RV32I-NEXT: slli a5, a5, 8
-; RV32I-NEXT: or a5, a5, s1
-; RV32I-NEXT: lbu s1, 30(a4)
-; RV32I-NEXT: lbu a4, 31(a4)
-; RV32I-NEXT: slli s2, t3, 1
-; RV32I-NEXT: sll s2, s2, t2
-; RV32I-NEXT: slli s1, s1, 16
-; RV32I-NEXT: slli a4, a4, 24
-; RV32I-NEXT: or a4, a4, s1
-; RV32I-NEXT: slli s1, s0, 1
-; RV32I-NEXT: sll s1, s1, t2
-; RV32I-NEXT: or s3, a4, a5
-; RV32I-NEXT: slli a4, s3, 1
-; RV32I-NEXT: sll t2, a4, t2
-; RV32I-NEXT: srl a4, t5, a3
-; RV32I-NEXT: srl a5, t0, a3
-; RV32I-NEXT: srl t0, t3, a3
-; RV32I-NEXT: srl a6, a6, a3
-; RV32I-NEXT: srl t3, s0, a3
-; RV32I-NEXT: srl t4, t4, a3
-; RV32I-NEXT: srl t5, t6, a3
-; RV32I-NEXT: sra a3, s3, a3
-; RV32I-NEXT: srli t6, t5, 16
-; RV32I-NEXT: sb t6, 26(a2)
-; RV32I-NEXT: or t2, t5, t2
-; RV32I-NEXT: sb t5, 24(a2)
-; RV32I-NEXT: srli t5, t5, 8
-; RV32I-NEXT: sb t5, 25(a2)
-; RV32I-NEXT: srli t5, a3, 24
-; RV32I-NEXT: sb t5, 31(a2)
-; RV32I-NEXT: srli t5, a3, 16
-; RV32I-NEXT: sb t5, 30(a2)
-; RV32I-NEXT: sb a3, 28(a2)
-; RV32I-NEXT: srli a3, a3, 8
-; RV32I-NEXT: sb a3, 29(a2)
-; RV32I-NEXT: srli a3, t4, 16
-; RV32I-NEXT: sb a3, 18(a2)
-; RV32I-NEXT: or a3, t4, s1
-; RV32I-NEXT: sb t4, 16(a2)
-; RV32I-NEXT: srli t4, t4, 8
-; RV32I-NEXT: sb t4, 17(a2)
-; RV32I-NEXT: srli t4, t3, 16
-; RV32I-NEXT: sb t4, 22(a2)
-; RV32I-NEXT: or t1, t3, t1
-; RV32I-NEXT: sb t3, 20(a2)
-; RV32I-NEXT: srli t3, t3, 8
-; RV32I-NEXT: sb t3, 21(a2)
-; RV32I-NEXT: srli t3, a6, 16
-; RV32I-NEXT: sb t3, 10(a2)
-; RV32I-NEXT: or t3, a6, s2
-; RV32I-NEXT: sb a6, 8(a2)
-; RV32I-NEXT: srli a6, a6, 8
-; RV32I-NEXT: sb a6, 9(a2)
-; RV32I-NEXT: srli a6, t0, 16
-; RV32I-NEXT: sb a6, 14(a2)
-; RV32I-NEXT: or a6, t0, a7
-; RV32I-NEXT: sb t0, 12(a2)
-; RV32I-NEXT: srli a7, t0, 8
-; RV32I-NEXT: sb a7, 13(a2)
-; RV32I-NEXT: srli a7, a5, 16
-; RV32I-NEXT: sb a7, 2(a2)
-; RV32I-NEXT: or a1, a5, a1
-; RV32I-NEXT: sb a5, 0(a2)
-; RV32I-NEXT: srli a5, a5, 8
-; RV32I-NEXT: sb a5, 1(a2)
-; RV32I-NEXT: srli a5, a4, 16
-; RV32I-NEXT: sb a5, 6(a2)
-; RV32I-NEXT: or a0, a4, a0
-; RV32I-NEXT: sb a4, 4(a2)
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: or t3, a0, t4
+; RV32I-NEXT: or t3, t3, a7
+; RV32I-NEXT: lbu a7, 1(a1)
+; RV32I-NEXT: lbu t4, 0(a1)
+; RV32I-NEXT: lbu t5, 2(a1)
+; RV32I-NEXT: lbu a1, 3(a1)
+; RV32I-NEXT: slli a7, a7, 8
+; RV32I-NEXT: or a7, a7, t4
+; RV32I-NEXT: slli t5, t5, 16
+; RV32I-NEXT: slli a1, a1, 24
+; RV32I-NEXT: or a1, a1, t5
+; RV32I-NEXT: or a7, a1, a7
+; RV32I-NEXT: srai a0, a0, 31
+; RV32I-NEXT: sw a0, 60(sp)
+; RV32I-NEXT: sw a0, 56(sp)
+; RV32I-NEXT: sw a0, 52(sp)
+; RV32I-NEXT: sw a0, 48(sp)
+; RV32I-NEXT: sw a0, 44(sp)
+; RV32I-NEXT: sw a0, 40(sp)
+; RV32I-NEXT: sw a0, 36(sp)
+; RV32I-NEXT: sw a0, 32(sp)
+; RV32I-NEXT: sw t3, 28(sp)
+; RV32I-NEXT: sw t2, 24(sp)
+; RV32I-NEXT: sw t1, 20(sp)
+; RV32I-NEXT: sw t0, 16(sp)
+; RV32I-NEXT: sw a6, 12(sp)
+; RV32I-NEXT: sw a5, 8(sp)
+; RV32I-NEXT: sw a4, 4(sp)
+; RV32I-NEXT: sw a3, 0(sp)
+; RV32I-NEXT: srli a0, a7, 3
+; RV32I-NEXT: andi a0, a0, 28
+; RV32I-NEXT: mv a1, sp
+; RV32I-NEXT: add a4, a1, a0
+; RV32I-NEXT: lw a1, 4(a4)
+; RV32I-NEXT: srl a0, a1, a7
+; RV32I-NEXT: lw a5, 8(a4)
+; RV32I-NEXT: andi a3, a7, 31
+; RV32I-NEXT: xori a6, a3, 31
+; RV32I-NEXT: lw a3, 0(a4)
+; RV32I-NEXT: slli t0, a5, 1
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a0, a0, t0
+; RV32I-NEXT: srl a3, a3, a7
+; RV32I-NEXT: slli a1, a1, 1
+; RV32I-NEXT: lw t0, 12(a4)
+; RV32I-NEXT: lw t1, 16(a4)
+; RV32I-NEXT: sll a1, a1, a6
+; RV32I-NEXT: or a1, a3, a1
+; RV32I-NEXT: srl a3, t0, a7
+; RV32I-NEXT: slli t2, t1, 1
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or a3, a3, t2
+; RV32I-NEXT: srl a5, a5, a7
+; RV32I-NEXT: slli t0, t0, 1
+; RV32I-NEXT: lw t2, 20(a4)
+; RV32I-NEXT: lw t3, 24(a4)
+; RV32I-NEXT: sll t0, t0, a6
+; RV32I-NEXT: or a5, a5, t0
+; RV32I-NEXT: srl t0, t2, a7
+; RV32I-NEXT: slli t4, t3, 1
+; RV32I-NEXT: sll t4, t4, a6
+; RV32I-NEXT: or t0, t0, t4
+; RV32I-NEXT: srl t1, t1, a7
+; RV32I-NEXT: slli t2, t2, 1
+; RV32I-NEXT: lw a4, 28(a4)
+; RV32I-NEXT: sll t2, t2, a6
+; RV32I-NEXT: or t1, t1, t2
+; RV32I-NEXT: srl t2, t3, a7
+; RV32I-NEXT: slli t3, a4, 1
+; RV32I-NEXT: sll a6, t3, a6
+; RV32I-NEXT: or a6, t2, a6
+; RV32I-NEXT: sra a4, a4, a7
+; RV32I-NEXT: sb a4, 28(a2)
+; RV32I-NEXT: srli a7, a4, 24
+; RV32I-NEXT: sb a7, 31(a2)
+; RV32I-NEXT: srli a7, a4, 16
+; RV32I-NEXT: sb a7, 30(a2)
; RV32I-NEXT: srli a4, a4, 8
-; RV32I-NEXT: sb a4, 5(a2)
-; RV32I-NEXT: srli a4, t2, 24
+; RV32I-NEXT: sb a4, 29(a2)
+; RV32I-NEXT: sb a6, 24(a2)
+; RV32I-NEXT: sb t1, 16(a2)
+; RV32I-NEXT: sb t0, 20(a2)
+; RV32I-NEXT: sb a5, 8(a2)
+; RV32I-NEXT: sb a3, 12(a2)
+; RV32I-NEXT: sb a1, 0(a2)
+; RV32I-NEXT: sb a0, 4(a2)
+; RV32I-NEXT: srli a4, a6, 24
; RV32I-NEXT: sb a4, 27(a2)
-; RV32I-NEXT: srli a3, a3, 24
-; RV32I-NEXT: sb a3, 19(a2)
-; RV32I-NEXT: srli a3, t1, 24
-; RV32I-NEXT: sb a3, 23(a2)
-; RV32I-NEXT: srli a3, t3, 24
-; RV32I-NEXT: sb a3, 11(a2)
-; RV32I-NEXT: srli a3, a6, 24
-; RV32I-NEXT: sb a3, 15(a2)
-; RV32I-NEXT: srli a1, a1, 24
-; RV32I-NEXT: sb a1, 3(a2)
-; RV32I-NEXT: srli a0, a0, 24
-; RV32I-NEXT: sb a0, 7(a2)
-; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
-; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
-; RV32I-NEXT: addi sp, sp, 144
+; RV32I-NEXT: srli a4, a6, 16
+; RV32I-NEXT: sb a4, 26(a2)
+; RV32I-NEXT: srli a4, a6, 8
+; RV32I-NEXT: sb a4, 25(a2)
+; RV32I-NEXT: srli a4, t1, 24
+; RV32I-NEXT: sb a4, 19(a2)
+; RV32I-NEXT: srli a4, t1, 16
+; RV32I-NEXT: sb a4, 18(a2)
+; RV32I-NEXT: srli a4, t1, 8
+; RV32I-NEXT: sb a4, 17(a2)
+; RV32I-NEXT: srli a4, t0, 24
+; RV32I-NEXT: sb a4, 23(a2)
+; RV32I-NEXT: srli a4, t0, 16
+; RV32I-NEXT: sb a4, 22(a2)
+; RV32I-NEXT: srli a4, t0, 8
+; RV32I-NEXT: sb a4, 21(a2)
+; RV32I-NEXT: srli a4, a5, 24
+; RV32I-NEXT: sb a4, 11(a2)
+; RV32I-NEXT: srli a4, a5, 16
+; RV32I-NEXT: sb a4, 10(a2)
+; RV32I-NEXT: srli a5, a5, 8
+; RV32I-NEXT: sb a5, 9(a2)
+; RV32I-NEXT: srli a4, a3, 24
+; RV32I-NEXT: sb a4, 15(a2)
+; RV32I-NEXT: srli a4, a3, 16
+; RV32I-NEXT: sb a4, 14(a2)
+; RV32I-NEXT: srli a3, a3, 8
+; RV32I-NEXT: sb a3, 13(a2)
+; RV32I-NEXT: srli a3, a1, 24
+; RV32I-NEXT: sb a3, 3(a2)
+; RV32I-NEXT: srli a3, a1, 16
+; RV32I-NEXT: sb a3, 2(a2)
+; RV32I-NEXT: srli a1, a1, 8
+; RV32I-NEXT: sb a1, 1(a2)
+; RV32I-NEXT: srli a1, a0, 24
+; RV32I-NEXT: sb a1, 7(a2)
+; RV32I-NEXT: srli a1, a0, 16
+; RV32I-NEXT: sb a1, 6(a2)
+; RV32I-NEXT: srli a0, a0, 8
+; RV32I-NEXT: sb a0, 5(a2)
+; RV32I-NEXT: addi sp, sp, 64
; RV32I-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
new file mode 100644
index 000000000000..52048a0a2065
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -0,0 +1,415 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE
+; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX2
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512F
+; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512BW
+
+define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
+; SSE-LABEL: v_test_canonicalize__half:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $16, %rsp
+; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: pextrw $0, %xmm0, %eax
+; SSE-NEXT: movw %ax, (%rbx)
+; SSE-NEXT: addq $16, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize__half:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: subq $16, %rsp
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX1-NEXT: addq $16, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v_test_canonicalize__half:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $16, %rsp
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX2-NEXT: addq $16, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v_test_canonicalize__half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v_test_canonicalize__half:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: movzwl (%rdi), %eax
+; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: movw %ax, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ %val = load half, half addrspace(1)* %out
+ %canonicalized = call half @llvm.canonicalize.f16(half %val)
+ store half %canonicalized, half addrspace(1)* %out
+ ret void
+}
+
+define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
+; SSE-LABEL: complex_canonicalize_fmul_half:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
+; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: subss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: popq %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: complex_canonicalize_fmul_half:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX1-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
+; AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: popq %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: complex_canonicalize_fmul_half:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload
+; AVX2-NEXT: # xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
+; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: popq %rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: complex_canonicalize_fmul_half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512F-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: complex_canonicalize_fmul_half:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512BW-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vaddss %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT: vsubss %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+entry:
+
+ %mul1 = fsub half %a, %b
+ %add = fadd half %mul1, %b
+ %mul2 = fsub half %add, %mul1
+ %canonicalized = call half @llvm.canonicalize.f16(half %mul2)
+ %result = fsub half %canonicalized, %b
+ ret half %result
+}
+
+define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind {
+; SSE-LABEL: v_test_canonicalize_v2half:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $48, %rsp
+; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: mulss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: callq __extendhfsf2@PLT
+; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; SSE-NEXT: callq __truncsfhf2@PLT
+; SSE-NEXT: pextrw $0, %xmm0, %eax
+; SSE-NEXT: movw %ax, 2(%rbx)
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-NEXT: pextrw $0, %xmm0, %eax
+; SSE-NEXT: movw %ax, (%rbx)
+; SSE-NEXT: addq $48, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize_v2half:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: subq $48, %rsp
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX1-NEXT: callq __truncsfhf2@PLT
+; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx)
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX1-NEXT: addq $48, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v_test_canonicalize_v2half:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $48, %rsp
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __extendhfsf2@PLT
+; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVX2-NEXT: callq __truncsfhf2@PLT
+; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx)
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx)
+; AVX2-NEXT: addq $48, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v_test_canonicalize_v2half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vmulss %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v_test_canonicalize_v2half:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm1
+; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512BW-NEXT: vmulss %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BW-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ %val = load <2 x half>, <2 x half> addrspace(1)* %out
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll
new file mode 100644
index 000000000000..13ea53389411
--- /dev/null
+++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll
@@ -0,0 +1,636 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
+; RUN: llc -mtriple=i686-- --mattr=-sse2 < %s | FileCheck %s -check-prefixes=SSE1
+; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE2
+; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1
+; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX2
+; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX512F
+
+define float @canon_fp32_varargsf32(float %a) {
+; SSE1-LABEL: canon_fp32_varargsf32:
+; SSE1: # %bb.0:
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmuls {{[0-9]+}}(%esp)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canon_fp32_varargsf32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: canon_fp32_varargsf32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+; Canonicalizes a by-value float; the checks expect one multiply (x87 fld1 + fmuls, or mulss/vmulss with a constant-pool operand).
+ %canonicalized = call float @llvm.canonicalize.f32(float %a)
+ ret float %canonicalized
+}
+
+define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) {
+; SSE1-LABEL: canon_fp32_varargsf80:
+; SSE1: # %bb.0:
+; SSE1-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmulp %st, %st(1)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canon_fp32_varargsf80:
+; SSE2: # %bb.0:
+; SSE2-NEXT: fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT: fld1
+; SSE2-NEXT: fmulp %st, %st(1)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: canon_fp32_varargsf80:
+; AVX1: # %bb.0:
+; AVX1-NEXT: fldt {{[0-9]+}}(%rsp)
+; AVX1-NEXT: fld1
+; AVX1-NEXT: fmulp %st, %st(1)
+; AVX1-NEXT: retq
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a) ; by-value x86_fp80 operand (despite "fp32" in the name); every check group expects fldt, fld1, fmulp
+ ret x86_fp80 %canonicalized
+}
+
+define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) {
+; SSE1-LABEL: complex_canonicalize_fmul_x86_fp80:
+; SSE1: # %bb.0: # %entry
+; SSE1-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE1-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE1-NEXT: fsub %st(1), %st
+; SSE1-NEXT: fld %st(0)
+; SSE1-NEXT: fadd %st(2), %st
+; SSE1-NEXT: fsubp %st, %st(1)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmulp %st, %st(1)
+; SSE1-NEXT: fsubp %st, %st(1)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: complex_canonicalize_fmul_x86_fp80:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT: fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT: fsub %st(1), %st
+; SSE2-NEXT: fld %st(0)
+; SSE2-NEXT: fadd %st(2), %st
+; SSE2-NEXT: fsubp %st, %st(1)
+; SSE2-NEXT: fld1
+; SSE2-NEXT: fmulp %st, %st(1)
+; SSE2-NEXT: fsubp %st, %st(1)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: complex_canonicalize_fmul_x86_fp80:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: fldt {{[0-9]+}}(%rsp)
+; AVX1-NEXT: fldt {{[0-9]+}}(%rsp)
+; AVX1-NEXT: fsub %st(1), %st
+; AVX1-NEXT: fld %st(0)
+; AVX1-NEXT: fadd %st(2), %st
+; AVX1-NEXT: fsubp %st, %st(1)
+; AVX1-NEXT: fld1
+; AVX1-NEXT: fmulp %st, %st(1)
+; AVX1-NEXT: fsubp %st, %st(1)
+; AVX1-NEXT: retq
+entry:
+; Canonicalizes the result of an fsub/fadd/fsub chain and feeds it to a final fsub. Note that %mul1 and %mul2 hold fsub results despite their names.
+ %mul1 = fsub x86_fp80 %a, %b
+ %add = fadd x86_fp80 %mul1, %b
+ %mul2 = fsub x86_fp80 %add, %mul1
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2) ; shows up as the fld1 + fmulp pair in every check group
+ %result = fsub x86_fp80 %canonicalized, %b
+ ret x86_fp80 %result
+}
+
+define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 {
+; SSE1-LABEL: canonicalize_fp64:
+; SSE1: # %bb.0: # %start
+; SSE1-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE1-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE1-NEXT: fucom %st(1)
+; SSE1-NEXT: fnstsw %ax
+; SSE1-NEXT: # kill: def $ah killed $ah killed $ax
+; SSE1-NEXT: sahf
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fucom %st(0)
+; SSE1-NEXT: fnstsw %ax
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: ja .LBB3_2
+; SSE1-NEXT: # %bb.1: # %start
+; SSE1-NEXT: fstp %st(0)
+; SSE1-NEXT: fldz
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: .LBB3_2: # %start
+; SSE1-NEXT: fstp %st(1)
+; SSE1-NEXT: # kill: def $ah killed $ah killed $ax
+; SSE1-NEXT: sahf
+; SSE1-NEXT: jp .LBB3_4
+; SSE1-NEXT: # %bb.3: # %start
+; SSE1-NEXT: fstp %st(1)
+; SSE1-NEXT: fldz
+; SSE1-NEXT: .LBB3_4: # %start
+; SSE1-NEXT: fstp %st(0)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmulp %st, %st(1)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canonicalize_fp64:
+; SSE2: # %bb.0: # %start
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: maxsd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: canonicalize_fp64:
+; AVX2: # %bb.0: # %start
+; AVX2-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: canonicalize_fp64:
+; AVX512F: # %bb.0: # %start
+; AVX512F-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512F-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512F-NEXT: retq
+start:
+; Picks %b when %a is unordered (NaN) or %a < %b, otherwise %a, then canonicalizes the pick. NOTE(review): attribute groups #0 and #2 have no "attributes" definition in this file - confirm they are intentional.
+ %c = fcmp olt double %a, %b
+ %d = fcmp uno double %a, 0.000000e+00
+ %or.cond.i.i = or i1 %d, %c
+ %e = select i1 %or.cond.i.i, double %b, double %a
+ %f = tail call double @llvm.canonicalize.f64(double %e) #2 ; NOTE(review): no AVX1 checks were generated for this function, so the +avx RUN line looks unverified here - confirm
+ ret double %f
+}
+
+define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 {
+; SSE1-LABEL: canonicalize_fp32:
+; SSE1: # %bb.0: # %start
+; SSE1-NEXT: flds {{[0-9]+}}(%esp)
+; SSE1-NEXT: flds {{[0-9]+}}(%esp)
+; SSE1-NEXT: fucom %st(1)
+; SSE1-NEXT: fnstsw %ax
+; SSE1-NEXT: # kill: def $ah killed $ah killed $ax
+; SSE1-NEXT: sahf
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fucom %st(0)
+; SSE1-NEXT: fnstsw %ax
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: ja .LBB4_2
+; SSE1-NEXT: # %bb.1: # %start
+; SSE1-NEXT: fstp %st(0)
+; SSE1-NEXT: fldz
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: .LBB4_2: # %start
+; SSE1-NEXT: fstp %st(1)
+; SSE1-NEXT: # kill: def $ah killed $ah killed $ax
+; SSE1-NEXT: sahf
+; SSE1-NEXT: jp .LBB4_4
+; SSE1-NEXT: # %bb.3: # %start
+; SSE1-NEXT: fstp %st(1)
+; SSE1-NEXT: fldz
+; SSE1-NEXT: .LBB4_4: # %start
+; SSE1-NEXT: fstp %st(0)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmulp %st, %st(1)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canonicalize_fp32:
+; SSE2: # %bb.0: # %start
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: canonicalize_fp32:
+; AVX2: # %bb.0: # %start
+; AVX2-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: canonicalize_fp32:
+; AVX512F: # %bb.0: # %start
+; AVX512F-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512F-NEXT: retq
+start:
+; Picks %bb when %aa is unordered (NaN) or %aa < %bb, otherwise %aa, then canonicalizes the pick. NOTE(review): attribute groups #0 and #2 have no "attributes" definition in this file - confirm they are intentional.
+ %cc = fcmp olt float %aa, %bb
+ %dd = fcmp uno float %aa, 0.000000e+00
+ %or.cond.i.i.x = or i1 %dd, %cc
+ %ee = select i1 %or.cond.i.i.x, float %bb, float %aa
+ %ff = tail call float @llvm.canonicalize.f32(float %ee) #2 ; NOTE(review): no AVX1 checks were generated for this function, so the +avx RUN line looks unverified here - confirm
+ ret float %ff
+}
+
+define void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: v_test_canonicalize_var_f32:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmuls (%eax)
+; SSE1-NEXT: fstps (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: v_test_canonicalize_var_f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize_var_f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovss %xmm0, (%rdi)
+; AVX1-NEXT: retq
+ %val = load float, ptr addrspace(1) %out ; loads a float through %out, canonicalizes it and stores it back in place
+ %canonicalized = call float @llvm.canonicalize.f32(float %val)
+ store float %canonicalized, ptr addrspace(1) %out ; opaque "ptr" spelling; the address space is unchanged
+ ret void
+}
+
+define void @v_test_canonicalize_x86_fp80(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: v_test_canonicalize_x86_fp80:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fldt (%eax)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmulp %st, %st(1)
+; SSE1-NEXT: fstpt (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: v_test_canonicalize_x86_fp80:
+; SSE2: # %bb.0:
+; SSE2-NEXT: fldt (%rdi)
+; SSE2-NEXT: fld1
+; SSE2-NEXT: fmulp %st, %st(1)
+; SSE2-NEXT: fstpt (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize_x86_fp80:
+; AVX1: # %bb.0:
+; AVX1-NEXT: fldt (%rdi)
+; AVX1-NEXT: fld1
+; AVX1-NEXT: fmulp %st, %st(1)
+; AVX1-NEXT: fstpt (%rdi)
+; AVX1-NEXT: retq
+; Loads an x86_fp80 through %out, canonicalizes it and stores it back in place; pointers use the opaque "ptr" spelling.
+ %val = load x86_fp80, ptr addrspace(1) %out
+ %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val)
+ store x86_fp80 %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: v_test_canonicalize_var_f64:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmull (%eax)
+; SSE1-NEXT: fstpl (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: v_test_canonicalize_var_f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movsd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: v_test_canonicalize_var_f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovsd %xmm0, (%rdi)
+; AVX1-NEXT: retq
+; Loads a double through %out, canonicalizes it and stores it back in place; pointers use the opaque "ptr" spelling.
+ %val = load double, ptr addrspace(1) %out
+ %canonicalized = call double @llvm.canonicalize.f64(double %val)
+ store double %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define void @canonicalize_undef(ptr addrspace(1) %out) {
+; SSE1-LABEL: canonicalize_undef:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000
+; SSE1-NEXT: movl $0, (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canonicalize_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; SSE2-NEXT: movq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: canonicalize_undef:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; AVX1-NEXT: movq %rax, (%rdi)
+; AVX1-NEXT: retq
+; Canonicalizing undef folds to a constant: the checks expect a plain integer store of 0x7FF8000000000000 (a quiet-NaN double bit pattern), with no multiply.
+ %canonicalized = call double @llvm.canonicalize.f64(double undef)
+ store double %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) {
+; SSE1-LABEL: canon_fp32_varargsv4f32:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fld %st(0)
+; SSE1-NEXT: fmuls {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: fmuls {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld %st(2)
+; SSE1-NEXT: fmuls {{[0-9]+}}(%esp)
+; SSE1-NEXT: fxch %st(3)
+; SSE1-NEXT: fmuls {{[0-9]+}}(%esp)
+; SSE1-NEXT: fstps 12(%eax)
+; SSE1-NEXT: fxch %st(2)
+; SSE1-NEXT: fstps 8(%eax)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fstps 4(%eax)
+; SSE1-NEXT: fstps (%eax)
+; SSE1-NEXT: retl $4
+;
+; SSE2-LABEL: canon_fp32_varargsv4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: canon_fp32_varargsv4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: canon_fp32_varargsv4f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+ %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) ; by-value <4 x float>; NOTE(review): no AVX1 checks were generated for this function, so the +avx RUN line looks unverified here - confirm
+ ret <4 x float> %canonicalized
+}
+
+define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) {
+; SSE1-LABEL: canon_fp64_varargsv4f64:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fld %st(0)
+; SSE1-NEXT: fmull {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: fmull {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld %st(2)
+; SSE1-NEXT: fmull {{[0-9]+}}(%esp)
+; SSE1-NEXT: fxch %st(3)
+; SSE1-NEXT: fmull {{[0-9]+}}(%esp)
+; SSE1-NEXT: fstpl 24(%eax)
+; SSE1-NEXT: fxch %st(2)
+; SSE1-NEXT: fstpl 16(%eax)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fstpl 8(%eax)
+; SSE1-NEXT: fstpl (%eax)
+; SSE1-NEXT: retl $4
+;
+; SSE2-LABEL: canon_fp64_varargsv4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0]
+; SSE2-NEXT: mulpd %xmm2, %xmm0
+; SSE2-NEXT: mulpd %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: canon_fp64_varargsv4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: canon_fp64_varargsv4f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512F-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+ %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a) ; by-value <4 x double>; the .v4f64 suffix matches the operand and result type
+ ret <4 x double> %canonicalized
+}
+
+define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) {
+; SSE1-LABEL: canon_fp80_varargsv2fp80:
+; SSE1: # %bb.0:
+; SSE1-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE1-NEXT: fldt {{[0-9]+}}(%esp)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmul %st, %st(1)
+; SSE1-NEXT: fmulp %st, %st(2)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: canon_fp80_varargsv2fp80:
+; SSE2: # %bb.0:
+; SSE2-NEXT: fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT: fldt {{[0-9]+}}(%rsp)
+; SSE2-NEXT: fld1
+; SSE2-NEXT: fmul %st, %st(1)
+; SSE2-NEXT: fmulp %st, %st(2)
+; SSE2-NEXT: fxch %st(1)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: canon_fp80_varargsv2fp80:
+; AVX1: # %bb.0:
+; AVX1-NEXT: fldt {{[0-9]+}}(%rsp)
+; AVX1-NEXT: fldt {{[0-9]+}}(%rsp)
+; AVX1-NEXT: fld1
+; AVX1-NEXT: fmul %st, %st(1)
+; AVX1-NEXT: fmulp %st, %st(2)
+; AVX1-NEXT: fxch %st(1)
+; AVX1-NEXT: retq
+ %canonicalized = call <2 x x86_fp80> @llvm.canonicalize.v2f80(<2 x x86_fp80> %a) ; by-value <2 x x86_fp80>; every check group loads both elements with fldt and multiplies each by one shared fld1
+ ret <2 x x86_fp80> %canonicalized
+}
+
+define void @vec_canonicalize_var_v4f32(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: vec_canonicalize_var_v4f32:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fld %st(0)
+; SSE1-NEXT: fmuls (%eax)
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: fmuls 4(%eax)
+; SSE1-NEXT: fld %st(2)
+; SSE1-NEXT: fmuls 8(%eax)
+; SSE1-NEXT: fxch %st(3)
+; SSE1-NEXT: fmuls 12(%eax)
+; SSE1-NEXT: fstps 12(%eax)
+; SSE1-NEXT: fxch %st(2)
+; SSE1-NEXT: fstps 8(%eax)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fstps 4(%eax)
+; SSE1-NEXT: fstps (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: vec_canonicalize_var_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: movaps %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: vec_canonicalize_var_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmulps (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vec_canonicalize_var_v4f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512F-NEXT: vmulps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vmovaps %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+ %val = load <4 x float>, ptr addrspace(1) %out ; loads <4 x float> through %out, canonicalizes it and stores it back in place
+ %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %val)
+ store <4 x float> %canonicalized, ptr addrspace(1) %out ; opaque "ptr" spelling; the address space is unchanged
+ ret void
+}
+
+define void @vec_canonicalize_var_v4f64(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: vec_canonicalize_var_v4f64:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fld %st(0)
+; SSE1-NEXT: fmull (%eax)
+; SSE1-NEXT: fld %st(1)
+; SSE1-NEXT: fmull 8(%eax)
+; SSE1-NEXT: fld %st(2)
+; SSE1-NEXT: fmull 16(%eax)
+; SSE1-NEXT: fxch %st(3)
+; SSE1-NEXT: fmull 24(%eax)
+; SSE1-NEXT: fstpl 24(%eax)
+; SSE1-NEXT: fxch %st(2)
+; SSE1-NEXT: fstpl 16(%eax)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fstpl 8(%eax)
+; SSE1-NEXT: fstpl (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: vec_canonicalize_var_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0]
+; SSE2-NEXT: movapd 16(%rdi), %xmm1
+; SSE2-NEXT: mulpd %xmm0, %xmm1
+; SSE2-NEXT: mulpd (%rdi), %xmm0
+; SSE2-NEXT: movapd %xmm0, (%rdi)
+; SSE2-NEXT: movapd %xmm1, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: vec_canonicalize_var_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vmulpd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vec_canonicalize_var_v4f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512F-NEXT: vmulpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vmovapd %ymm0, (%rdi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+ %val = load <4 x double>, ptr addrspace(1) %out ; loads <4 x double> through %out, canonicalizes it and stores it back in place
+ %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %val) ; the .v4f64 suffix matches the <4 x double> operand and result type
+ store <4 x double> %canonicalized, ptr addrspace(1) %out ; opaque "ptr" spelling; the address space is unchanged
+ ret void
+}
+
+define void @vec_canonicalize_x86_fp80(ptr addrspace(1) %out) #1 {
+; SSE1-LABEL: vec_canonicalize_x86_fp80:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE1-NEXT: fldt 30(%eax)
+; SSE1-NEXT: fldt 20(%eax)
+; SSE1-NEXT: fldt 10(%eax)
+; SSE1-NEXT: fldt (%eax)
+; SSE1-NEXT: fld1
+; SSE1-NEXT: fmul %st, %st(1)
+; SSE1-NEXT: fmul %st, %st(2)
+; SSE1-NEXT: fmul %st, %st(3)
+; SSE1-NEXT: fmulp %st, %st(4)
+; SSE1-NEXT: fxch %st(3)
+; SSE1-NEXT: fstpt 30(%eax)
+; SSE1-NEXT: fxch %st(1)
+; SSE1-NEXT: fstpt 20(%eax)
+; SSE1-NEXT: fstpt 10(%eax)
+; SSE1-NEXT: fstpt (%eax)
+; SSE1-NEXT: retl
+;
+; SSE2-LABEL: vec_canonicalize_x86_fp80:
+; SSE2: # %bb.0:
+; SSE2-NEXT: fldt 30(%rdi)
+; SSE2-NEXT: fldt 20(%rdi)
+; SSE2-NEXT: fldt 10(%rdi)
+; SSE2-NEXT: fldt (%rdi)
+; SSE2-NEXT: fld1
+; SSE2-NEXT: fmul %st, %st(1)
+; SSE2-NEXT: fmul %st, %st(2)
+; SSE2-NEXT: fmul %st, %st(3)
+; SSE2-NEXT: fmulp %st, %st(4)
+; SSE2-NEXT: fxch %st(3)
+; SSE2-NEXT: fstpt 30(%rdi)
+; SSE2-NEXT: fxch %st(1)
+; SSE2-NEXT: fstpt 20(%rdi)
+; SSE2-NEXT: fstpt 10(%rdi)
+; SSE2-NEXT: fstpt (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: vec_canonicalize_x86_fp80:
+; AVX1: # %bb.0:
+; AVX1-NEXT: fldt 30(%rdi)
+; AVX1-NEXT: fldt 20(%rdi)
+; AVX1-NEXT: fldt 10(%rdi)
+; AVX1-NEXT: fldt (%rdi)
+; AVX1-NEXT: fld1
+; AVX1-NEXT: fmul %st, %st(1)
+; AVX1-NEXT: fmul %st, %st(2)
+; AVX1-NEXT: fmul %st, %st(3)
+; AVX1-NEXT: fmulp %st, %st(4)
+; AVX1-NEXT: fxch %st(3)
+; AVX1-NEXT: fstpt 30(%rdi)
+; AVX1-NEXT: fxch %st(1)
+; AVX1-NEXT: fstpt 20(%rdi)
+; AVX1-NEXT: fstpt 10(%rdi)
+; AVX1-NEXT: fstpt (%rdi)
+; AVX1-NEXT: retq
+ %val = load <4 x x86_fp80>, ptr addrspace(1) %out ; loads <4 x x86_fp80> through %out (elements at byte offsets 0/10/20/30 per the checks), canonicalizes and stores back
+ %canonicalized = call <4 x x86_fp80> @llvm.canonicalize.v4f80(<4 x x86_fp80> %val) ; the .v4f80 suffix matches the <4 x x86_fp80> operand and result type
+ store <4 x x86_fp80> %canonicalized, ptr addrspace(1) %out ; opaque "ptr" spelling; the address space is unchanged
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1d3b015f3c54..c350ed64280d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -174,22 +174,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $156, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 16(%ebp), %edx
; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: xorl %eax, %ecx
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -198,32 +199,33 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: sarl $31, %edx
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl 32(%ebp), %ebx
; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: xorl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 28(%ebp), %edi
; X86-NEXT: xorl %edx, %edi
; X86-NEXT: subl %edx, %edi
-; X86-NEXT: sbbl %edx, %ebp
; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %esi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %edx
@@ -232,359 +234,357 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebp, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: bsrl %edi, %edi
; X86-NEXT: xorl $31, %edi
; X86-NEXT: orl $32, %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebp, %ebp
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %edi
; X86-NEXT: orl $64, %edi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: bsrl %ebp, %ecx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %eax, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: xorl %ebx, %ebx
; X86-NEXT: subl %edx, %edi
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $127, %ecx
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: cmpl %edi, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %edx
-; X86-NEXT: cmovnel %ebx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: cmovnel %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovnel %ebx, %eax
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: jne .LBB4_8
-; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl $127, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.8: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl $127, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: je .LBB4_8
-; X86-NEXT: # %bb.2: # %udiv-bb1
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 148(%esp,%edi), %edx
-; X86-NEXT: movl 152(%esp,%edi), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 144(%esp,%edi), %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: shrl %ebp
-; X86-NEXT: shrl %cl, %ebp
-; X86-NEXT: orl %edx, %ebp
-; X86-NEXT: movl 140(%esp,%edi), %edx
-; X86-NEXT: movb %ch, %cl
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 152(%esp,%eax), %esi
+; X86-NEXT: movl 156(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%eax), %edx
+; X86-NEXT: movl 148(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_3
+; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %edi, %edi
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_3: # %udiv-preheader
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB4_9
+; X86-NEXT: .LBB4_2: # %udiv-preheader
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 108(%esp,%eax), %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %dl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: movb %dl, %cl
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: movl 104(%esp,%edx), %ebx
-; X86-NEXT: movl 100(%esp,%edx), %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %ebp
-; X86-NEXT: movl 92(%esp,%edx), %esi
+; X86-NEXT: movl 104(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%edx), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl 96(%esp,%eax), %esi
+; X86-NEXT: movl 100(%esp,%eax), %eax
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $-1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB4_4: # %udiv-do-while
+; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebp
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: shldl $1, %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: orl %esi, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: subl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.4:
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebp, %edx
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: addl %edi, %edx
; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %eax, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: .LBB4_8: # %udiv-end
+; X86-NEXT: .LBB4_9: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: xorl %ecx, %eax
; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: subl %ecx, %edx
; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ecx, %ebp
-; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebp, 8(%ecx)
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %edi
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 40(%ebp), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %esi
+; X86-NEXT: imull %edx, %ebx
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: addl $156, %esp
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 58ea70e58028..16dc1d6b446c 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -174,379 +174,370 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86: # %bb.0: # %_udiv-special-cases
; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: orl 36(%ebp), %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sete %bl
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sete %cl
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: orl 24(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: orl 20(%ebp), %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
-; X86-NEXT: orb %bl, %al
-; X86-NEXT: movb %al, (%esp) # 1-byte Spill
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: bsrl 36(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebp, %ebp
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl $31, %ebp
-; X86-NEXT: orl $32, %ebp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: cmovnel %edx, %ebp
-; X86-NEXT: orl $64, %ebp
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: cmovnel %ecx, %ebp
-; X86-NEXT: bsrl %esi, %edx
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: bsrl %ecx, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 16(%ebp), %edi
; X86-NEXT: bsrl %edi, %esi
; X86-NEXT: xorl $31, %esi
-; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: bsrl 12(%ebp), %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: subl %edx, %ebp
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $127, %ecx
-; X86-NEXT: cmpl %ebp, %ecx
+; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
-; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl $0, %edi
-; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: cmovnel %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: orb %cl, %al
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb %cl, %ah
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: cmovnel %esi, %ebx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnel %esi, %ecx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: cmovnel %edx, %esi
+; X86-NEXT: movl 12(%ebp), %edi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: jne .LBB4_7
; X86-NEXT: # %bb.1: # %udiv-bb1
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: xorb $127, %al
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 128(%esp,%eax), %edx
-; X86-NEXT: movl 132(%esp,%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 136(%esp,%eax), %edi
+; X86-NEXT: movl 140(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edi, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 124(%esp,%eax), %ebp
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl 120(%esp,%eax), %eax
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %eax, %ebp
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl 128(%esp,%eax), %ebx
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 20(%ebp), %ebx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.5:
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: jmp .LBB4_6
; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %edx
; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl 16(%ebp), %edx
; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
+; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 84(%esp,%eax), %ebx
+; X86-NEXT: movl 92(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esp,%eax), %esi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%eax), %ebp
-; X86-NEXT: movl 76(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: notb %cl
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl 88(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %edi
+; X86-NEXT: movl 84(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shrdl %cl, %edx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 32(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: shldl $1, %ebp, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %edx, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebp, %ecx
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl 40(%ebp), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: andl 36(%ebp), %eax
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl 32(%ebp), %edx
+; X86-NEXT: andl 28(%ebp), %ecx
+; X86-NEXT: subl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edi, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: shldl $1, %ebp, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %ebp
-; X86-NEXT: orl %ecx, %ebp
-; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: .LBB4_6: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl 44(%ebp), %eax
; X86-NEXT: .LBB4_7: # %udiv-end
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %ebp, 4(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: imull %edx, %esi
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %edi
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: imull %ecx, %edi
; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull 28(%ebp), %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: imull %edx, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull 32(%ebp)
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sbbl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %ecx
; X86-NEXT: sbbl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, (%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: addl $136, %esp
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 6fcebdb5116d..fb169a3777fb 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,7 +22,7 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $176, %esp
+; X86-NEXT: subl $160, %esp
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -96,18 +96,16 @@ define void @f() nounwind {
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: andl $3, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
-; X86-NEXT: movb %cl, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: negb %cl
-; X86-NEXT: movsbl %cl, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -117,29 +115,24 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esp,%esi), %edi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 128(%esp,%esi), %ebx
-; X86-NEXT: movl 132(%esp,%esi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shrl %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl 112(%esp,%esi), %edi
+; X86-NEXT: movl 116(%esp,%esi), %eax
+; X86-NEXT: movl 120(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: je .LBB0_13
; X86-NEXT: # %bb.11: # %udiv-preheader
-; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl %esi, %edi
; X86-NEXT: andl $3, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -148,26 +141,20 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movb %al, %ch
-; X86-NEXT: andb $7, %ch
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl 80(%esp,%edx), %edi
-; X86-NEXT: movl 84(%esp,%edx), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: notb %cl
-; X86-NEXT: movl 88(%esp,%edx), %ebx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: movl 64(%esp,%eax), %esi
+; X86-NEXT: movl 68(%esp,%eax), %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -175,70 +162,69 @@ define void @f() nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $3, %eax
-; X86-NEXT: andl $3, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_12: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl $2, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: leal (%eax,%edi,2), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl $2, %edx
+; X86-NEXT: shrl %edx
+; X86-NEXT: leal (%edx,%ebx,2), %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $3, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: shll $30, %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: sarl $30, %eax
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: shrdl $1, %esi, %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: andl $1, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $3, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: shll $30, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sarl $30, %edi
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: shrdl $1, %edx, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %ebx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: adcl $-1, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $3, %esi
-; X86-NEXT: andl $3, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $3, %edi
+; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edx, %eax
; X86-NEXT: jne .LBB0_12
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index df3c25a8c42a..6be79edbe51e 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -13,26 +13,24 @@ define i256 @test1(i256 %a) nounwind {
; ILP-LABEL: test1:
; ILP: # %bb.0:
; ILP-NEXT: movq %rdi, %rax
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: leal (%rsi,%rsi), %ecx
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: addb $3, %cl
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
-; ILP-NEXT: andb $7, %cl
+; ILP-NEXT: andb $24, %dl
; ILP-NEXT: negb %dl
; ILP-NEXT: movsbq %dl, %rdx
-; ILP-NEXT: movq -16(%rsp,%rdx), %rsi
-; ILP-NEXT: movq -8(%rsp,%rdx), %rdi
+; ILP-NEXT: movq -24(%rsp,%rdx), %rsi
+; ILP-NEXT: movq -16(%rsp,%rdx), %rdi
; ILP-NEXT: shldq %cl, %rsi, %rdi
-; ILP-NEXT: movq -32(%rsp,%rdx), %r8
-; ILP-NEXT: movq -24(%rsp,%rdx), %rdx
+; ILP-NEXT: movq -40(%rsp,%rdx), %r8
+; ILP-NEXT: movq -32(%rsp,%rdx), %rdx
; ILP-NEXT: movq %r8, %r9
; ILP-NEXT: shlq %cl, %r9
; ILP-NEXT: movq %rdx, %r10
@@ -52,27 +50,25 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
; HYBRID-NEXT: movq %rdi, %rax
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: addl %esi, %esi
-; HYBRID-NEXT: addb $3, %sil
-; HYBRID-NEXT: movl %esi, %ecx
-; HYBRID-NEXT: andb $7, %cl
-; HYBRID-NEXT: shrb $3, %sil
-; HYBRID-NEXT: negb %sil
-; HYBRID-NEXT: movsbq %sil, %rdx
-; HYBRID-NEXT: movq -16(%rsp,%rdx), %rsi
-; HYBRID-NEXT: movq -8(%rsp,%rdx), %rdi
+; HYBRID-NEXT: leal (%rsi,%rsi), %ecx
+; HYBRID-NEXT: addb $3, %cl
+; HYBRID-NEXT: movl %ecx, %edx
+; HYBRID-NEXT: shrb $3, %dl
+; HYBRID-NEXT: andb $24, %dl
+; HYBRID-NEXT: negb %dl
+; HYBRID-NEXT: movsbq %dl, %rdx
+; HYBRID-NEXT: movq -24(%rsp,%rdx), %rsi
+; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdi
; HYBRID-NEXT: shldq %cl, %rsi, %rdi
; HYBRID-NEXT: movq %rdi, 24(%rax)
-; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdi
-; HYBRID-NEXT: movq -24(%rsp,%rdx), %rdx
+; HYBRID-NEXT: movq -40(%rsp,%rdx), %rdi
+; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdx
; HYBRID-NEXT: movq %rdx, %r8
; HYBRID-NEXT: shldq %cl, %rdi, %r8
; HYBRID-NEXT: movq %r8, 8(%rax)
@@ -81,6 +77,7 @@ define i256 @test1(i256 %a) nounwind {
; HYBRID-NEXT: shlq %cl, %rsi
; HYBRID-NEXT: notb %cl
; HYBRID-NEXT: shrq %rdx
+; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx
; HYBRID-NEXT: shrq %cl, %rdx
; HYBRID-NEXT: orq %rsi, %rdx
; HYBRID-NEXT: movq %rdx, 16(%rax)
@@ -89,27 +86,25 @@ define i256 @test1(i256 %a) nounwind {
; BURR-LABEL: test1:
; BURR: # %bb.0:
; BURR-NEXT: movq %rdi, %rax
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: addl %esi, %esi
-; BURR-NEXT: addb $3, %sil
-; BURR-NEXT: movl %esi, %ecx
-; BURR-NEXT: andb $7, %cl
-; BURR-NEXT: shrb $3, %sil
-; BURR-NEXT: negb %sil
-; BURR-NEXT: movsbq %sil, %rdx
-; BURR-NEXT: movq -16(%rsp,%rdx), %rsi
-; BURR-NEXT: movq -8(%rsp,%rdx), %rdi
+; BURR-NEXT: leal (%rsi,%rsi), %ecx
+; BURR-NEXT: addb $3, %cl
+; BURR-NEXT: movl %ecx, %edx
+; BURR-NEXT: shrb $3, %dl
+; BURR-NEXT: andb $24, %dl
+; BURR-NEXT: negb %dl
+; BURR-NEXT: movsbq %dl, %rdx
+; BURR-NEXT: movq -24(%rsp,%rdx), %rsi
+; BURR-NEXT: movq -16(%rsp,%rdx), %rdi
; BURR-NEXT: shldq %cl, %rsi, %rdi
; BURR-NEXT: movq %rdi, 24(%rax)
-; BURR-NEXT: movq -32(%rsp,%rdx), %rdi
-; BURR-NEXT: movq -24(%rsp,%rdx), %rdx
+; BURR-NEXT: movq -40(%rsp,%rdx), %rdi
+; BURR-NEXT: movq -32(%rsp,%rdx), %rdx
; BURR-NEXT: movq %rdx, %r8
; BURR-NEXT: shldq %cl, %rdi, %r8
; BURR-NEXT: movq %r8, 8(%rax)
@@ -118,6 +113,7 @@ define i256 @test1(i256 %a) nounwind {
; BURR-NEXT: shlq %cl, %rsi
; BURR-NEXT: notb %cl
; BURR-NEXT: shrq %rdx
+; BURR-NEXT: # kill: def $cl killed $cl killed $ecx
; BURR-NEXT: shrq %cl, %rdx
; BURR-NEXT: orq %rsi, %rdx
; BURR-NEXT: movq %rdx, 16(%rax)
@@ -126,33 +122,31 @@ define i256 @test1(i256 %a) nounwind {
; SRC-LABEL: test1:
; SRC: # %bb.0:
; SRC-NEXT: movq %rdi, %rax
-; SRC-NEXT: addl %esi, %esi
-; SRC-NEXT: addb $3, %sil
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: leal (%rsi,%rsi), %edx
+; SRC-NEXT: addb $3, %dl
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movl %esi, %edx
-; SRC-NEXT: andb $7, %dl
-; SRC-NEXT: shrb $3, %sil
-; SRC-NEXT: negb %sil
-; SRC-NEXT: movsbq %sil, %rsi
-; SRC-NEXT: movq -16(%rsp,%rsi), %rdi
+; SRC-NEXT: movl %edx, %ecx
+; SRC-NEXT: shrb $3, %cl
+; SRC-NEXT: andb $24, %cl
+; SRC-NEXT: negb %cl
+; SRC-NEXT: movsbq %cl, %rsi
+; SRC-NEXT: movq -24(%rsp,%rsi), %rdi
; SRC-NEXT: movq %rdi, %r8
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shlq %cl, %r8
; SRC-NEXT: notb %cl
-; SRC-NEXT: movq -32(%rsp,%rsi), %r9
-; SRC-NEXT: movq -24(%rsp,%rsi), %r10
+; SRC-NEXT: movq -40(%rsp,%rsi), %r9
+; SRC-NEXT: movq -32(%rsp,%rsi), %r10
; SRC-NEXT: movq %r10, %r11
; SRC-NEXT: shrq %r11
; SRC-NEXT: shrq %cl, %r11
; SRC-NEXT: orq %r8, %r11
-; SRC-NEXT: movq -8(%rsp,%rsi), %rsi
+; SRC-NEXT: movq -16(%rsp,%rsi), %rsi
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shldq %cl, %rdi, %rsi
; SRC-NEXT: movq %r9, %rdi
@@ -171,27 +165,25 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: addb $3, %dl
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shrb $3, %cl
+; LIN-NEXT: andb $24, %cl
; LIN-NEXT: negb %cl
; LIN-NEXT: movsbq %cl, %rsi
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq -32(%rsp,%rsi), %rdi
-; LIN-NEXT: andb $7, %dl
+; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; LIN-NEXT: movq -40(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r8
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r8
; LIN-NEXT: movq %r8, (%rax)
-; LIN-NEXT: movq -24(%rsp,%rsi), %r8
+; LIN-NEXT: movq -32(%rsp,%rsi), %r8
; LIN-NEXT: movq %r8, %r9
; LIN-NEXT: shldq %cl, %rdi, %r9
; LIN-NEXT: movq %r9, 8(%rax)
-; LIN-NEXT: movq -16(%rsp,%rsi), %rdi
+; LIN-NEXT: movq -24(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r9
; LIN-NEXT: shlq %cl, %r9
; LIN-NEXT: shrq %r8
@@ -199,7 +191,7 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: shrq %cl, %r8
; LIN-NEXT: orq %r9, %r8
; LIN-NEXT: movq %r8, 16(%rax)
-; LIN-NEXT: movq -8(%rsp,%rsi), %rsi
+; LIN-NEXT: movq -16(%rsp,%rsi), %rsi
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %rdi, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)
diff --git a/llvm/test/CodeGen/X86/section-stats.ll b/llvm/test/CodeGen/X86/section-stats.ll
index 94d0a965ac59..2cab7d18dec0 100644
--- a/llvm/test/CodeGen/X86/section-stats.ll
+++ b/llvm/test/CodeGen/X86/section-stats.ll
@@ -3,6 +3,8 @@
; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC text sections
; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC read-write sections
+; CHECK-DAG: 512 elf-object-writer - Total size of section headers table
+; CHECK-DAG: 64 elf-object-writer - Total size of ELF headers
target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 4fbe05cd1b2f..767bd772ab7a 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -10,49 +10,45 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_lshr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, (%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 8(%esp,%edi), %eax
+; i686-NEXT: movl 4(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl (%esp,%edi), %esi
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %eax
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %eax, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -81,50 +77,46 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_ashr_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, (%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: movzbl %cl, %ebp
-; i686-NEXT: movl 4(%esp,%ebp), %edx
-; i686-NEXT: movl %edx, %esi
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 8(%esp,%ebp), %ebx
-; i686-NEXT: leal (%ebx,%ebx), %edi
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %esi, %edi
-; i686-NEXT: movl (%esp,%ebp), %esi
-; i686-NEXT: movl 12(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %ebx
-; i686-NEXT: shrdl %cl, %edx, %esi
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %ebp, 12(%eax)
-; i686-NEXT: movl %ebx, 8(%eax)
-; i686-NEXT: movl %esi, (%eax)
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: movzbl %al, %edi
+; i686-NEXT: movl 8(%esp,%edi), %eax
+; i686-NEXT: movl 4(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: shrdl %cl, %eax, %edx
+; i686-NEXT: movl (%esp,%edi), %esi
+; i686-NEXT: movl 12(%esp,%edi), %edi
+; i686-NEXT: shrdl %cl, %edi, %eax
+; i686-NEXT: shrdl %cl, %ebx, %esi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: sarl %cl, %edi
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %eax, 8(%ebx)
+; i686-NEXT: movl %edx, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -154,15 +146,17 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-LABEL: test_shl_i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $32, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $48, %esp
+; i686-NEXT: movl 24(%ebp), %ecx
+; i686-NEXT: movl 8(%ebp), %eax
+; i686-NEXT: movl 12(%ebp), %edx
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -172,36 +166,27 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind {
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, (%esp)
; i686-NEXT: movl %ecx, %eax
-; i686-NEXT: andb $7, %al
-; i686-NEXT: shrb $3, %cl
-; i686-NEXT: andb $15, %cl
-; i686-NEXT: negb %cl
-; i686-NEXT: movsbl %cl, %ebp
-; i686-NEXT: movl 24(%esp,%ebp), %ebx
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: notb %cl
-; i686-NEXT: movl 20(%esp,%ebp), %edi
-; i686-NEXT: movl %edi, %esi
-; i686-NEXT: shrl %esi
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl 16(%esp,%ebp), %edx
-; i686-NEXT: movl 28(%esp,%ebp), %ebp
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shldl %cl, %ebx, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl %edx, %ebx
-; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: shll %cl, %ebx
-; i686-NEXT: shldl %cl, %edx, %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl %edi, 4(%eax)
-; i686-NEXT: movl %ebx, (%eax)
-; i686-NEXT: movl %esi, 8(%eax)
-; i686-NEXT: addl $32, %esp
+; i686-NEXT: shrb $3, %al
+; i686-NEXT: andb $12, %al
+; i686-NEXT: negb %al
+; i686-NEXT: movsbl %al, %edi
+; i686-NEXT: movl 20(%esp,%edi), %eax
+; i686-NEXT: movl 24(%esp,%edi), %ebx
+; i686-NEXT: movl %ebx, %esi
+; i686-NEXT: shldl %cl, %eax, %esi
+; i686-NEXT: movl 16(%esp,%edi), %edx
+; i686-NEXT: movl 28(%esp,%edi), %edi
+; i686-NEXT: shldl %cl, %ebx, %edi
+; i686-NEXT: movl 40(%ebp), %ebx
+; i686-NEXT: movl %edi, 12(%ebx)
+; i686-NEXT: movl %esi, 8(%ebx)
+; i686-NEXT: movl %edx, %esi
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shldl %cl, %edx, %eax
+; i686-NEXT: movl %eax, 4(%ebx)
+; i686-NEXT: movl %esi, (%ebx)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -264,104 +249,93 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_lshr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %esi
+; i686-NEXT: movl 20(%ebp), %edi
; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %esi, %ecx
-; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: andl $31, %ebx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 40(%esp,%edx), %eax
+; i686-NEXT: movl 36(%esp,%edx), %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %eax, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 32(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl $3, %esi
-; i686-NEXT: andl $15, %esi
-; i686-NEXT: movl 40(%esp,%esi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 44(%esp,%esi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
+; i686-NEXT: movl 44(%esp,%edx), %edx
; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 36(%esp,%esi), %eax
+; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: movl %ebx, %esi
+; i686-NEXT: shrdl %cl, %edx, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %ebx
-; i686-NEXT: andl $15, %ebx
-; i686-NEXT: movl 72(%esp,%ebx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 76(%esp,%ebx), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: leal (%eax,%eax), %edi
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: orl %ebp, %edi
-; i686-NEXT: movl 48(%esp,%esi), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: movl 56(%ebp), %edx
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 72(%esp,%edx), %ebx
+; i686-NEXT: movl 68(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %ebx, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 64(%esp,%edx), %edi
+; i686-NEXT: movl 76(%esp,%edx), %edx
+; i686-NEXT: shrdl %cl, %edx, %ebx
+; i686-NEXT: movl %esi, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl 68(%esp,%ebx), %ecx
-; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT: movl 80(%esp,%ebx), %esi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
+; i686-NEXT: # kill: def $cl killed $cl killed $ecx
+; i686-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: movl %eax, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %esi, 28(%ecx)
-; i686-NEXT: movl %ebx, 24(%ecx)
-; i686-NEXT: movl (%esp), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 16(%ecx)
-; i686-NEXT: movl %ebp, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %edi, 20(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: shrl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %ebx, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -402,107 +376,96 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-LABEL: test_ashr_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $92, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $112, %esp
+; i686-NEXT: movl 40(%ebp), %edx
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %esi
+; i686-NEXT: movl 16(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 20(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT: sarl $31, %eax
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, %ebx
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: shrl $3, %edi
-; i686-NEXT: andl $15, %edi
-; i686-NEXT: movl 32(%esp,%edi), %eax
-; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shrl %cl, %eax
+; i686-NEXT: sarl $31, %edi
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl %edx, %eax
+; i686-NEXT: andl $31, %eax
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 40(%esp,%edx), %esi
+; i686-NEXT: movl 36(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 32(%esp,%edx), %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 44(%esp,%edx), %edx
+; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shrdl %cl, %edx, %esi
+; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 56(%ebp), %edx
+; i686-NEXT: movl %edx, %ebx
+; i686-NEXT: andl $31, %ebx
+; i686-NEXT: shrl $3, %edx
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: movl 72(%esp,%edx), %esi
+; i686-NEXT: movl 68(%esp,%edx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 36(%esp,%edi), %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: addl %edx, %edx
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: orl %eax, %edx
-; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebp, %eax
-; i686-NEXT: movl %ebp, %edx
-; i686-NEXT: andl $7, %edx
-; i686-NEXT: shrl $3, %eax
-; i686-NEXT: andl $15, %eax
-; i686-NEXT: movl 64(%esp,%eax), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %eax, (%esp) # 4-byte Spill
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrl %cl, %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: notl %ecx
-; i686-NEXT: movl 68(%esp,%eax), %esi
-; i686-NEXT: leal (%esi,%esi), %eax
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shll %cl, %eax
-; i686-NEXT: orl %ebp, %eax
-; i686-NEXT: movl 28(%esp,%edi), %ecx
+; i686-NEXT: shrdl %cl, %esi, %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 64(%esp,%edx), %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 40(%esp,%edi), %edi
+; i686-NEXT: movl 76(%esp,%edx), %edx
; i686-NEXT: movl %ebx, %ecx
+; i686-NEXT: shrdl %cl, %edx, %esi
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT: movl 60(%esp,%ecx), %ebp
-; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl 72(%esp,%ecx), %ebp
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: shrdl %cl, %ebp, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill
; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: sarl %cl, %edi
-; i686-NEXT: movl %edx, %ecx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: sarl %cl, %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %ebp, 28(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 24(%ecx)
-; i686-NEXT: movl %ebx, 16(%ecx)
-; i686-NEXT: movl %edi, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 8(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 20(%ecx)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 4(%ecx)
-; i686-NEXT: addl $92, %esp
+; i686-NEXT: shrdl %cl, %eax, %edi
+; i686-NEXT: sarl %cl, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl %esi, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %edi, 16(%eax)
+; i686-NEXT: movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
@@ -546,112 +509,106 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-LABEL: test_shl_v2i128:
; i686: # %bb.0: # %entry
; i686-NEXT: pushl %ebp
+; i686-NEXT: movl %esp, %ebp
; i686-NEXT: pushl %ebx
; i686-NEXT: pushl %edi
; i686-NEXT: pushl %esi
-; i686-NEXT: subl $100, %esp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT: movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT: andl $-16, %esp
+; i686-NEXT: subl $128, %esp
+; i686-NEXT: movl 40(%ebp), %edi
+; i686-NEXT: movl 24(%ebp), %eax
+; i686-NEXT: movl 28(%ebp), %ecx
+; i686-NEXT: movl 32(%ebp), %edx
+; i686-NEXT: movl 20(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 16(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 12(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 8(%ebp), %esi
+; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT: movl 36(%ebp), %esi
; i686-NEXT: movl %esi, {{[0-9]+}}(%esp)
; i686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; i686-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT: movl %ebp, %ecx
-; i686-NEXT: shrl $3, %ebp
-; i686-NEXT: andl $15, %ebp
+; i686-NEXT: movl %edi, %ebx
+; i686-NEXT: shrl $3, %ebx
+; i686-NEXT: andl $12, %ebx
; i686-NEXT: leal {{[0-9]+}}(%esp), %eax
-; i686-NEXT: subl %ebp, %eax
+; i686-NEXT: subl %ebx, %eax
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: movl 8(%eax), %edx
-; i686-NEXT: movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT: andl $7, %ecx
+; i686-NEXT: movl (%eax), %esi
+; i686-NEXT: movl 4(%eax), %edx
+; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 8(%eax), %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %edi, %ecx
+; i686-NEXT: andl $31, %ecx
; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shll %cl, %edx
-; i686-NEXT: movl 4(%eax), %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %esi
-; i686-NEXT: notl %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %esi
-; i686-NEXT: orl %edx, %esi
-; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT: movl (%eax), %eax
+; i686-NEXT: shldl %cl, %edx, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %edx
+; i686-NEXT: movl 56(%ebp), %eax
+; i686-NEXT: movl %eax, %edx
; i686-NEXT: shrl $3, %edx
-; i686-NEXT: andl $15, %edx
-; i686-NEXT: leal {{[0-9]+}}(%esp), %esi
-; i686-NEXT: subl %edx, %esi
+; i686-NEXT: andl $12, %edx
+; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; i686-NEXT: subl %edx, %ecx
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
; i686-NEXT: movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT: andl $7, %ebx
-; i686-NEXT: movl 8(%esi), %edi
+; i686-NEXT: movl (%ecx), %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl 4(%esi), %eax
+; i686-NEXT: movl 4(%ecx), %edi
+; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl 8(%ecx), %ecx
+; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: andl $31, %eax
; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: shrl %eax
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: notl %ecx
+; i686-NEXT: movl %ecx, %eax
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: shrl %cl, %eax
-; i686-NEXT: orl %edi, %eax
-; i686-NEXT: movl (%esi), %ecx
-; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
+; i686-NEXT: shldl %cl, %edi, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: movl %esi, %eax
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT: shll %cl, %eax
+; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT: negl %ebp
-; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: # kill: def $cl killed $cl killed $ecx
-; i686-NEXT: movl (%esp), %edi # 4-byte Reload
-; i686-NEXT: shldl %cl, %edi, %esi
-; i686-NEXT: movl %esi, (%esp) # 4-byte Spill
+; i686-NEXT: negl %ebx
+; i686-NEXT: movl 76(%esp,%ebx), %ebx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT: movl %esi, %edi
-; i686-NEXT: movl %ebx, %ecx
-; i686-NEXT: shll %cl, %edi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; i686-NEXT: shldl %cl, %esi, %ebp
+; i686-NEXT: shldl %cl, %esi, %ebx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT: movl %edi, %esi
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: shll %cl, %esi
+; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %edx
-; i686-NEXT: movl 96(%esp,%edx), %edx
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT: shldl %cl, %ebx, %edx
-; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT: movl %edx, 28(%ecx)
-; i686-NEXT: movl %ebp, 20(%ecx)
-; i686-NEXT: movl %edi, 16(%ecx)
-; i686-NEXT: movl (%esp), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 12(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, 4(%ecx)
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; i686-NEXT: movl %edx, (%ecx)
-; i686-NEXT: movl %eax, 24(%ecx)
+; i686-NEXT: movl 108(%esp,%edx), %edx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, 8(%ecx)
-; i686-NEXT: addl $100, %esp
+; i686-NEXT: shldl %cl, %eax, %edx
+; i686-NEXT: movl 72(%ebp), %eax
+; i686-NEXT: movl %edx, 28(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 24(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 20(%eax)
+; i686-NEXT: movl %esi, 16(%eax)
+; i686-NEXT: movl %ebx, 12(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 8(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, 4(%eax)
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT: movl %ecx, (%eax)
+; i686-NEXT: leal -12(%ebp), %esp
; i686-NEXT: popl %esi
; i686-NEXT: popl %edi
; i686-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index e1466aebf422..128e2199fb56 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -8,98 +8,78 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-LABEL: shift1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $112, %esp
+; CHECK-NEXT: movl 40(%ebp), %ecx
+; CHECK-NEXT: movl 8(%ebp), %eax
+; CHECK-NEXT: movl 12(%ebp), %edx
+; CHECK-NEXT: movl 16(%ebp), %esi
+; CHECK-NEXT: movl 32(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 28(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 24(%ebp), %edi
; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 20(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl 36(%ebp), %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT: sarl $31, %esi
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: sarl $31, %edi
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: andb $7, %al
-; CHECK-NEXT: shrb $3, %cl
-; CHECK-NEXT: movzbl %cl, %ebp
-; CHECK-NEXT: movl 32(%esp,%ebp), %esi
+; CHECK-NEXT: shrb $5, %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: movl 40(%esp,%eax,4), %edx
+; CHECK-NEXT: movl 36(%esp,%eax,4), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: notb %dl
-; CHECK-NEXT: movl 36(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 40(%esp,%ebp), %esi
+; CHECK-NEXT: shrdl %cl, %edx, %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %esi
-; CHECK-NEXT: movl 44(%esp,%ebp), %ecx
-; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; CHECK-NEXT: leal (%ecx,%ecx), %edi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %edi
-; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl 44(%esp,%eax,4), %esi
+; CHECK-NEXT: shrdl %cl, %esi, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 48(%esp,%eax,4), %ebx
+; CHECK-NEXT: shrdl %cl, %ebx, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 52(%esp,%eax,4), %esi
+; CHECK-NEXT: shrdl %cl, %esi, %ebx
+; CHECK-NEXT: movl 56(%esp,%eax,4), %edx
+; CHECK-NEXT: shrdl %cl, %edx, %esi
+; CHECK-NEXT: movl 32(%esp,%eax,4), %edi
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 48(%esp,%ebp), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: movl 52(%esp,%ebp), %edi
-; CHECK-NEXT: leal (%edi,%edi), %esi
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: orl %ebx, %esi
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; CHECK-NEXT: movl 28(%esp,%ebp), %edx
-; CHECK-NEXT: movl 56(%esp,%ebp), %ebx
-; CHECK-NEXT: shrdl %cl, %ebx, %edi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; CHECK-NEXT: shrdl %cl, %ebp, %edx
-; CHECK-NEXT: sarl %cl, %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl %ebx, 28(%eax)
-; CHECK-NEXT: movl %edi, 24(%eax)
-; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: movl %edx, (%eax)
-; CHECK-NEXT: movl %esi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 12(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 4(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl 60(%esp,%eax,4), %eax
+; CHECK-NEXT: shrdl %cl, %eax, %edx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: sarl %cl, %eax
+; CHECK-NEXT: movl 72(%ebp), %ecx
+; CHECK-NEXT: movl %eax, 28(%ecx)
+; CHECK-NEXT: movl %edx, 24(%ecx)
+; CHECK-NEXT: movl %esi, 20(%ecx)
+; CHECK-NEXT: movl %ebx, 16(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 12(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 8(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, 4(%ecx)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: movl %eax, (%ecx)
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -120,42 +100,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %r8b, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %r8b, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: movb %cl, %dl
+; CHECK-X64-O0-NEXT: shrb $6, %dl
; CHECK-X64-O0-NEXT: movzbl %dl, %edx
; CHECK-X64-O0-NEXT: movl %edx, %edi
-; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx
-; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r9
-; CHECK-X64-O0-NEXT: shrq %cl, %r9
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: addq %r10, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
+; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT: movq -72(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT: movq %r9, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r10, %r9
-; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi
+; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rdi
; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx
+; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
+; CHECK-X64-O0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-X64-O0-NEXT: sarq %cl, %rdi
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax)
; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax)
-; CHECK-X64-O0-NEXT: movq %rdx, (%rax)
-; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rax)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rax)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift1:
; CHECK-X64-O2: # %bb.0: # %entry
-; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
@@ -165,29 +138,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %r8d, %eax
-; CHECK-X64-O2-NEXT: andb $7, %al
-; CHECK-X64-O2-NEXT: shrb $3, %r8b
-; CHECK-X64-O2-NEXT: movzbl %r8b, %edx
-; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10
-; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11
-; CHECK-X64-O2-NEXT: shlq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx
-; CHECK-X64-O2-NEXT: movl %eax, %ecx
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10
-; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrb $6, %cl
+; CHECK-X64-O2-NEXT: movzbl %cl, %edx
+; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT: movq -72(%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT: movq %r9, %r10
+; CHECK-X64-O2-NEXT: movl %r8d, %ecx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10
+; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi
+; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi
; CHECK-X64-O2-NEXT: sarq %cl, %rdx
-; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9)
-; CHECK-X64-O2-NEXT: movq %r10, 16(%r9)
-; CHECK-X64-O2-NEXT: movq %rsi, (%r9)
-; CHECK-X64-O2-NEXT: movq %r11, 8(%r9)
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
; CHECK-X64-O2-NEXT: retq
entry:
%0 = ashr i256 %x, %a
@@ -199,11 +166,13 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-LABEL: shift2:
; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $92, %esp
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: andl $-16, %esp
+; CHECK-NEXT: subl $112, %esp
+; CHECK-NEXT: movl 12(%ebp), %ecx
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -220,68 +189,54 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movb %al, %ch
-; CHECK-NEXT: andb $7, %ch
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrb $3, %al
+; CHECK-NEXT: andb $28, %al
; CHECK-NEXT: negb %al
; CHECK-NEXT: movsbl %al, %eax
-; CHECK-NEXT: movl 68(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: notb %cl
-; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; CHECK-NEXT: movl 64(%esp,%eax), %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebp
-; CHECK-NEXT: shrl %cl, %ebp
-; CHECK-NEXT: orl %edx, %ebp
-; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 76(%esp,%eax), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %edx
-; CHECK-NEXT: movl 72(%esp,%eax), %ebx
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shrl %ebx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %ebx
-; CHECK-NEXT: orl %edx, %ebx
-; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl 68(%esp,%eax), %esi
; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl 80(%esp,%eax), %edi
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: orl %esi, %edx
-; CHECK-NEXT: movb %ch, %cl
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: movl 72(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: shldl %cl, %esi, %edi
-; CHECK-NEXT: movl 60(%esp,%eax), %ebp
-; CHECK-NEXT: movl 88(%esp,%eax), %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %eax, %esi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 76(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %edi
+; CHECK-NEXT: shldl %cl, %edx, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 80(%esp,%eax), %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldl %cl, %esi, %edi
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 84(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: shldl %cl, %edx, %ebx
+; CHECK-NEXT: movl 88(%esp,%eax), %edi
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shldl %cl, %esi, %edx
+; CHECK-NEXT: movl 64(%esp,%eax), %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 92(%esp,%eax), %esi
+; CHECK-NEXT: shldl %cl, %edi, %esi
+; CHECK-NEXT: movl 8(%ebp), %eax
; CHECK-NEXT: movl %esi, 28(%eax)
-; CHECK-NEXT: movl %edi, 20(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; CHECK-NEXT: movl %esi, 12(%eax)
-; CHECK-NEXT: movl %ebp, %esi
-; CHECK-NEXT: shll %cl, %esi
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; CHECK-NEXT: shldl %cl, %ebp, %edi
-; CHECK-NEXT: movl %edi, 4(%eax)
-; CHECK-NEXT: movl %esi, (%eax)
; CHECK-NEXT: movl %edx, 24(%eax)
-; CHECK-NEXT: movl %ebx, 16(%eax)
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: movl %ecx, 8(%eax)
-; CHECK-NEXT: addl $92, %esp
+; CHECK-NEXT: movl %ebx, 20(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 16(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 12(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: movl %edx, 8(%eax)
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: shll %cl, %edx
+; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: shldl %cl, %edi, %esi
+; CHECK-NEXT: movl %esi, 4(%eax)
+; CHECK-NEXT: movl %edx, (%eax)
+; CHECK-NEXT: leal -12(%ebp), %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
@@ -299,77 +254,64 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movb %sil, %dl
-; CHECK-X64-O0-NEXT: movb %dl, %cl
-; CHECK-X64-O0-NEXT: andb $7, %cl
+; CHECK-X64-O0-NEXT: movb %sil, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-X64-O0-NEXT: movb %cl, %dl
; CHECK-X64-O0-NEXT: shrb $3, %dl
+; CHECK-X64-O0-NEXT: andb $24, %dl
; CHECK-X64-O0-NEXT: negb %dl
-; CHECK-X64-O0-NEXT: movsbq %dl, %rdx
-; CHECK-X64-O0-NEXT: movq -16(%rsp,%rdx), %rsi
-; CHECK-X64-O0-NEXT: movq %rsi, %r10
-; CHECK-X64-O0-NEXT: shlq %cl, %r10
+; CHECK-X64-O0-NEXT: movsbq %dl, %r8
+; CHECK-X64-O0-NEXT: movq -40(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT: movq %r10, %rsi
+; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: notb %cl
-; CHECK-X64-O0-NEXT: movq -32(%rsp,%rdx), %r9
-; CHECK-X64-O0-NEXT: movq -24(%rsp,%rdx), %r8
-; CHECK-X64-O0-NEXT: movq %r8, %r11
-; CHECK-X64-O0-NEXT: shrq %r11
-; CHECK-X64-O0-NEXT: shrq %cl, %r11
-; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: orq %r11, %r10
-; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-X64-O0-NEXT: movq -8(%rsp,%rdx), %rdx
-; CHECK-X64-O0-NEXT: shldq %cl, %rsi, %rdx
+; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq %r9, %rsi
-; CHECK-X64-O0-NEXT: shlq %cl, %rsi
+; CHECK-X64-O0-NEXT: movq %r9, %r10
+; CHECK-X64-O0-NEXT: shlq %cl, %r10
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: shldq %cl, %r9, %r8
+; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-X64-O0-NEXT: shldq %cl, %r9, %rdx
; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-X64-O0-NEXT: movq %r8, 8(%rdi)
-; CHECK-X64-O0-NEXT: movq %rsi, (%rdi)
-; CHECK-X64-O0-NEXT: movq %rdx, 24(%rdi)
-; CHECK-X64-O0-NEXT: movq %rcx, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %r8, 24(%rdi)
+; CHECK-X64-O0-NEXT: movq %rsi, 16(%rdi)
+; CHECK-X64-O0-NEXT: movq %rdx, 8(%rdi)
+; CHECK-X64-O0-NEXT: movq %rcx, (%rdi)
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift2:
; CHECK-X64-O2: # %bb.0:
+; CHECK-X64-O2-NEXT: movq %rsi, %rcx
; CHECK-X64-O2-NEXT: movq %rdi, %rax
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movl %esi, %edx
-; CHECK-X64-O2-NEXT: andb $7, %dl
-; CHECK-X64-O2-NEXT: shrb $3, %sil
-; CHECK-X64-O2-NEXT: negb %sil
-; CHECK-X64-O2-NEXT: movsbq %sil, %rsi
-; CHECK-X64-O2-NEXT: movq -16(%rsp,%rsi), %rdi
-; CHECK-X64-O2-NEXT: movq %rdi, %r8
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
+; CHECK-X64-O2-NEXT: movl %ecx, %edx
+; CHECK-X64-O2-NEXT: shrb $3, %dl
+; CHECK-X64-O2-NEXT: andb $24, %dl
+; CHECK-X64-O2-NEXT: negb %dl
+; CHECK-X64-O2-NEXT: movsbq %dl, %rdx
+; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq %r8, %r9
+; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9
+; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx
+; CHECK-X64-O2-NEXT: movq %rsi, %r8
; CHECK-X64-O2-NEXT: shlq %cl, %r8
-; CHECK-X64-O2-NEXT: notb %cl
-; CHECK-X64-O2-NEXT: movq -32(%rsp,%rsi), %r9
-; CHECK-X64-O2-NEXT: movq -24(%rsp,%rsi), %r10
-; CHECK-X64-O2-NEXT: movq %r10, %r11
-; CHECK-X64-O2-NEXT: shrq %r11
-; CHECK-X64-O2-NEXT: shrq %cl, %r11
-; CHECK-X64-O2-NEXT: orq %r8, %r11
-; CHECK-X64-O2-NEXT: movq -8(%rsp,%rsi), %rsi
-; CHECK-X64-O2-NEXT: movl %edx, %ecx
-; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %rsi
-; CHECK-X64-O2-NEXT: movq %r9, %rdi
-; CHECK-X64-O2-NEXT: shlq %cl, %rdi
-; CHECK-X64-O2-NEXT: shldq %cl, %r9, %r10
-; CHECK-X64-O2-NEXT: movq %rsi, 24(%rax)
-; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
-; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
-; CHECK-X64-O2-NEXT: movq %r11, 16(%rax)
+; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx
+; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi
+; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax)
+; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
+; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax)
+; CHECK-X64-O2-NEXT: movq %r8, (%rax)
; CHECK-X64-O2-NEXT: retq
{
%b = shl i256 1, %c ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index e5affd86312e..277525796824 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -646,7 +646,869 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_16bytes:
+; FALLBACK16-LABEL: lshr_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edx
+; FALLBACK16-NEXT: movl 4(%ecx), %esi
+; FALLBACK16-NEXT: movl 8(%ecx), %edi
+; FALLBACK16-NEXT: movl 12(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %al
+; FALLBACK16-NEXT: shlb $3, %al
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: movzbl %ah, %ebp
+; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl %ebx, 12(%edx)
+; FALLBACK16-NEXT: movl %ebp, 8(%edx)
+; FALLBACK16-NEXT: movl %esi, (%edx)
+; FALLBACK16-NEXT: movl %edi, 4(%edx)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $44, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl (%edx), %esi
+; FALLBACK17-NEXT: movl 4(%edx), %edi
+; FALLBACK17-NEXT: movl 8(%edx), %ebx
+; FALLBACK17-NEXT: movl 12(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, (%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: movzbl %ch, %ebx
+; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi
+; FALLBACK17-NEXT: movl (%esp,%ebx), %edx
+; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp
+; FALLBACK17-NEXT: movl %ebp, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx
+; FALLBACK17-NEXT: shrl %cl, %ebx
+; FALLBACK17-NEXT: movl %esi, 8(%eax)
+; FALLBACK17-NEXT: movl %ebx, 12(%eax)
+; FALLBACK17-NEXT: movl %edx, (%eax)
+; FALLBACK17-NEXT: movl %edi, 4(%eax)
+; FALLBACK17-NEXT: addl $44, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl (%ecx), %edx
+; FALLBACK18-NEXT: movl 4(%ecx), %esi
+; FALLBACK18-NEXT: movl 8(%ecx), %edi
+; FALLBACK18-NEXT: movl 12(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %ebx
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, (%esp)
+; FALLBACK18-NEXT: andb $12, %bl
+; FALLBACK18-NEXT: movzbl %bl, %esi
+; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
+; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx
+; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %ebp, %ecx
+; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %ebp, %edi
+; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
+; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
+; FALLBACK18-NEXT: orl %ebx, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT: movl %eax, 12(%esi)
+; FALLBACK18-NEXT: movl %edx, 8(%esi)
+; FALLBACK18-NEXT: movl %edi, (%esi)
+; FALLBACK18-NEXT: movl %ecx, 4(%esi)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl (%edx), %esi
+; FALLBACK19-NEXT: movl 4(%edx), %edi
+; FALLBACK19-NEXT: movl 8(%edx), %ebx
+; FALLBACK19-NEXT: movl 12(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, (%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: movzbl %al, %eax
+; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx
+; FALLBACK19-NEXT: movl (%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %edi
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi
+; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl %ebx, 8(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl %edi, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: movzbl %cl, %edi
+; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl %edi, 12(%edx)
+; FALLBACK20-NEXT: movl %ebx, 4(%edx)
+; FALLBACK20-NEXT: movl %ebp, 8(%edx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, (%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: movzbl %dl, %ebx
+; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK21-NEXT: movl %ebp, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK21-NEXT: movl %eax, %ebx
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK21-NEXT: movl %edi, 8(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: shrl %cl, %edx
+; FALLBACK21-NEXT: movl %edx, 12(%ebp)
+; FALLBACK21-NEXT: movl %esi, (%ebp)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, (%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: notb %cl
+; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK22-NEXT: orl %ebx, %edx
+; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebx, %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, 12(%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %edi, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, (%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, (%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: movzbl %dl, %ebx
+; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %ebx
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK23-NEXT: movl %edi, 8(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK23-NEXT: movl %edx, 12(%ebp)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, (%ebp)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: movzbl %cl, %edi
+; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl %edi, 12(%edx)
+; FALLBACK24-NEXT: movl %ebx, 4(%edx)
+; FALLBACK24-NEXT: movl %ebp, 8(%edx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: movzbl %dl, %ebx
+; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK25-NEXT: movl %ebp, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK25-NEXT: movl %eax, %ebx
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK25-NEXT: movl %edi, 8(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: shrl %cl, %edx
+; FALLBACK25-NEXT: movl %edx, 12(%ebp)
+; FALLBACK25-NEXT: movl %esi, (%ebp)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: notb %cl
+; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK26-NEXT: orl %ebx, %edx
+; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebx, %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT: orl %ebp, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, 12(%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %edi, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, (%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: movzbl %dl, %ebx
+; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %ebx
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK27-NEXT: movl %edi, 8(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK27-NEXT: movl %edx, 12(%ebp)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, (%ebp)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: movzbl %cl, %edi
+; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl %edi, 12(%edx)
+; FALLBACK28-NEXT: movl %ebx, 4(%edx)
+; FALLBACK28-NEXT: movl %ebp, 8(%edx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: movzbl %dl, %ebx
+; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK29-NEXT: movl %ebp, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK29-NEXT: movl %eax, %ebx
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK29-NEXT: movl %edi, 8(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: shrl %cl, %edx
+; FALLBACK29-NEXT: movl %edx, 12(%ebp)
+; FALLBACK29-NEXT: movl %esi, (%ebp)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: notb %cl
+; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp
+; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx
+; FALLBACK30-NEXT: orl %ebx, %edx
+; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp
+; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %eax, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebx, %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT: orl %ebp, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, 12(%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %edi, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, (%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, (%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: movzbl %dl, %ebx
+; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %ebx
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %ebx, 4(%ebp)
+; FALLBACK31-NEXT: movl %edi, 8(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx
+; FALLBACK31-NEXT: movl %edx, 12(%ebp)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, (%ebp)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = lshr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: lshr_16bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
@@ -660,19 +1522,17 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, (%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-SSE2-NEXT: andl $3, %ecx
+; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
@@ -683,46 +1543,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_16bytes:
+; X86-SSE42-LABEL: lshr_16bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
+; X86-SSE42-NEXT: subl $44, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
+; X86-SSE42-NEXT: addl $44, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: lshr_16bytes:
+; X86-AVX-LABEL: lshr_16bytes_dwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: subl $44, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
+; X86-AVX-NEXT: addl $44, %esp
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = lshr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
@@ -800,7 +1661,877 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: shl_16bytes:
+; FALLBACK16-LABEL: shl_16bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $60, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %ebx
+; FALLBACK16-NEXT: movl 4(%ecx), %esi
+; FALLBACK16-NEXT: movl 8(%ecx), %edi
+; FALLBACK16-NEXT: movl 12(%ecx), %ecx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movb %ah, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $12, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %ebp
+; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %edi, %esi
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 8(%eax)
+; FALLBACK16-NEXT: movl %ebp, 12(%eax)
+; FALLBACK16-NEXT: movl %ebx, 4(%eax)
+; FALLBACK16-NEXT: addl $60, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_16bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $32, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK17-NEXT: movl (%edx), %esi
+; FALLBACK17-NEXT: movl 4(%edx), %edi
+; FALLBACK17-NEXT: movl 8(%edx), %ebx
+; FALLBACK17-NEXT: movl 12(%edx), %edx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, (%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $12, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %edi
+; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: movl %esi, 8(%eax)
+; FALLBACK17-NEXT: movl %edx, 12(%eax)
+; FALLBACK17-NEXT: movl %ebx, (%eax)
+; FALLBACK17-NEXT: movl %edi, 4(%eax)
+; FALLBACK17-NEXT: addl $32, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_16bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $44, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl (%ecx), %edx
+; FALLBACK18-NEXT: movl 4(%ecx), %esi
+; FALLBACK18-NEXT: movl 8(%ecx), %edi
+; FALLBACK18-NEXT: movl 12(%ecx), %ecx
+; FALLBACK18-NEXT: movzbl (%eax), %eax
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: shlb $3, %bl
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, (%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $12, %al
+; FALLBACK18-NEXT: negb %al
+; FALLBACK18-NEXT: movsbl %al, %edx
+; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi
+; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: notb %al
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %eax, %edi, %edi
+; FALLBACK18-NEXT: orl %esi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
+; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx
+; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx
+; FALLBACK18-NEXT: shrl %edx
+; FALLBACK18-NEXT: shrxl %eax, %edx, %edx
+; FALLBACK18-NEXT: orl %esi, %edx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl %ebp, (%ecx)
+; FALLBACK18-NEXT: movl %eax, 8(%ecx)
+; FALLBACK18-NEXT: movl %edx, 12(%ecx)
+; FALLBACK18-NEXT: movl %edi, 4(%ecx)
+; FALLBACK18-NEXT: addl $44, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_16bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $44, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK19-NEXT: movl (%edx), %esi
+; FALLBACK19-NEXT: movl 4(%edx), %edi
+; FALLBACK19-NEXT: movl 8(%edx), %ebx
+; FALLBACK19-NEXT: movl 12(%edx), %edx
+; FALLBACK19-NEXT: movzbl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, (%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $12, %al
+; FALLBACK19-NEXT: negb %al
+; FALLBACK19-NEXT: movsbl %al, %eax
+; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
+; FALLBACK19-NEXT: shldl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax
+; FALLBACK19-NEXT: shldl %cl, %eax, %esi
+; FALLBACK19-NEXT: shldl %cl, %edi, %eax
+; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx
+; FALLBACK19-NEXT: movl %esi, 8(%ebp)
+; FALLBACK19-NEXT: movl %edx, 12(%ebp)
+; FALLBACK19-NEXT: movl %ecx, (%ebp)
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $44, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_16bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $60, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm1, %xmm1
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $12, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %edi
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebp, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: movl %eax, (%edx)
+; FALLBACK20-NEXT: movl %ebp, 4(%edx)
+; FALLBACK20-NEXT: movl %edi, 8(%edx)
+; FALLBACK20-NEXT: movl %esi, 12(%edx)
+; FALLBACK20-NEXT: addl $60, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_16bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $44, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movups (%edx), %xmm0
+; FALLBACK21-NEXT: movzbl (%ecx), %edx
+; FALLBACK21-NEXT: movl %edx, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm1, %xmm1
+; FALLBACK21-NEXT: movaps %xmm1, (%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $12, %dl
+; FALLBACK21-NEXT: negb %dl
+; FALLBACK21-NEXT: movsbl %dl, %edi
+; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %ebx, %ebp
+; FALLBACK21-NEXT: shll %cl, %ebp
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl %edi, 4(%eax)
+; FALLBACK21-NEXT: movl %esi, 8(%eax)
+; FALLBACK21-NEXT: movl %edx, 12(%eax)
+; FALLBACK21-NEXT: movl %ebp, (%eax)
+; FALLBACK21-NEXT: addl $44, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_16bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $44, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm1, %xmm1
+; FALLBACK22-NEXT: movaps %xmm1, (%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $12, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %ecx
+; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK22-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %edx
+; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT: orl %esi, %edx
+; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %esi, %ebp
+; FALLBACK22-NEXT: shrl %ebp
+; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl %eax, (%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %ebp, 8(%esi)
+; FALLBACK22-NEXT: movl %edx, 12(%esi)
+; FALLBACK22-NEXT: addl $44, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_16bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $44, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movups (%edx), %xmm0
+; FALLBACK23-NEXT: movzbl (%ecx), %edx
+; FALLBACK23-NEXT: movl %edx, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm1, %xmm1
+; FALLBACK23-NEXT: movaps %xmm1, (%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $12, %dl
+; FALLBACK23-NEXT: negb %dl
+; FALLBACK23-NEXT: movsbl %dl, %edi
+; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %esi
+; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl %edx, 12(%eax)
+; FALLBACK23-NEXT: movl %ebp, (%eax)
+; FALLBACK23-NEXT: addl $44, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_16bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $60, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $12, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %edi
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebp, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: movl %eax, (%edx)
+; FALLBACK24-NEXT: movl %ebp, 4(%edx)
+; FALLBACK24-NEXT: movl %edi, 8(%edx)
+; FALLBACK24-NEXT: movl %esi, 12(%edx)
+; FALLBACK24-NEXT: addl $60, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_16bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $44, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: vmovups (%edx), %xmm0
+; FALLBACK25-NEXT: movzbl (%ecx), %edx
+; FALLBACK25-NEXT: movl %edx, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $12, %dl
+; FALLBACK25-NEXT: negb %dl
+; FALLBACK25-NEXT: movsbl %dl, %edi
+; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %ebx, %ebp
+; FALLBACK25-NEXT: shll %cl, %ebp
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl %edi, 4(%eax)
+; FALLBACK25-NEXT: movl %esi, 8(%eax)
+; FALLBACK25-NEXT: movl %edx, 12(%eax)
+; FALLBACK25-NEXT: movl %ebp, (%eax)
+; FALLBACK25-NEXT: addl $44, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_16bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $44, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $12, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %ecx
+; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK26-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %edx
+; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT: orl %esi, %edx
+; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %esi, %ebp
+; FALLBACK26-NEXT: shrl %ebp
+; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl %eax, (%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %ebp, 8(%esi)
+; FALLBACK26-NEXT: movl %edx, 12(%esi)
+; FALLBACK26-NEXT: addl $44, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_16bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $44, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: vmovups (%edx), %xmm0
+; FALLBACK27-NEXT: movzbl (%ecx), %edx
+; FALLBACK27-NEXT: movl %edx, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $12, %dl
+; FALLBACK27-NEXT: negb %dl
+; FALLBACK27-NEXT: movsbl %dl, %edi
+; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %esi
+; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl %edx, 12(%eax)
+; FALLBACK27-NEXT: movl %ebp, (%eax)
+; FALLBACK27-NEXT: addl $44, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_16bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $60, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $12, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %edi
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebp, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: movl %eax, (%edx)
+; FALLBACK28-NEXT: movl %ebp, 4(%edx)
+; FALLBACK28-NEXT: movl %edi, 8(%edx)
+; FALLBACK28-NEXT: movl %esi, 12(%edx)
+; FALLBACK28-NEXT: addl $60, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_16bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $44, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: vmovups (%edx), %xmm0
+; FALLBACK29-NEXT: movzbl (%ecx), %edx
+; FALLBACK29-NEXT: movl %edx, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $12, %dl
+; FALLBACK29-NEXT: negb %dl
+; FALLBACK29-NEXT: movsbl %dl, %edi
+; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %ebx, %ebp
+; FALLBACK29-NEXT: shll %cl, %ebp
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl %edi, 4(%eax)
+; FALLBACK29-NEXT: movl %esi, 8(%eax)
+; FALLBACK29-NEXT: movl %edx, 12(%eax)
+; FALLBACK29-NEXT: movl %ebp, (%eax)
+; FALLBACK29-NEXT: addl $44, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_16bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $44, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $12, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %ecx
+; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx
+; FALLBACK30-NEXT: shlxl %eax, %edx, %edi
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %edx
+; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT: orl %esi, %edx
+; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %esi, %ebp
+; FALLBACK30-NEXT: shrl %ebp
+; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl %eax, (%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %ebp, 8(%esi)
+; FALLBACK30-NEXT: movl %edx, 12(%esi)
+; FALLBACK30-NEXT: addl $44, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_16bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $44, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: vmovups (%edx), %xmm0
+; FALLBACK31-NEXT: movzbl (%ecx), %edx
+; FALLBACK31-NEXT: movl %edx, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovaps %xmm1, (%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $12, %dl
+; FALLBACK31-NEXT: negb %dl
+; FALLBACK31-NEXT: movsbl %dl, %edi
+; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi
+; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx
+; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %esi
+; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl %edx, 12(%eax)
+; FALLBACK31-NEXT: movl %ebp, (%eax)
+; FALLBACK31-NEXT: addl $44, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = shl i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: shl_16bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
@@ -814,15 +2545,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, (%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, (%esp)
-; X86-SSE2-NEXT: andb $15, %cl
+; X86-SSE2-NEXT: shlb $2, %cl
+; X86-SSE2-NEXT: andb $12, %cl
; X86-SSE2-NEXT: negb %cl
; X86-SSE2-NEXT: movsbl %cl, %ecx
; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx
@@ -839,50 +2569,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_16bytes:
+; X86-SSE42-LABEL: shl_16bytes_dwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $32, %esp
+; X86-SSE42-NEXT: subl $44, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
-; X86-SSE42-NEXT: movups %xmm1, (%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $15, %cl
+; X86-SSE42-NEXT: movaps %xmm1, (%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $2, %cl
+; X86-SSE42-NEXT: andb $12, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $32, %esp
+; X86-SSE42-NEXT: addl $44, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: shl_16bytes:
+; X86-AVX-LABEL: shl_16bytes_dwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $32, %esp
+; X86-AVX-NEXT: subl $44, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX-NEXT: vmovups %xmm1, (%esp)
-; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $15, %cl
+; X86-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: shlb $2, %cl
+; X86-AVX-NEXT: andb $12, %cl
; X86-AVX-NEXT: negb %cl
; X86-AVX-NEXT: movsbl %cl, %ecx
; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $32, %esp
+; X86-AVX-NEXT: addl $44, %esp
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = shl i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
@@ -960,7 +2693,312 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-NO-BMI2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-NEXT: movb %ah, %al
+; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %ch, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebp, %edx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
+ %src = load i128, ptr %src.ptr, align 1
+ %byteOff = load i128, ptr %byteOff.ptr, align 1
+ %bitOff = shl i128 %byteOff, 3
+ %res = ashr i128 %src, %bitOff
+ store i128 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
+; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
+; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
+; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
+; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
+;
+; X86-SSE2-LABEL: ashr_16bytes_dwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
@@ -983,11 +3021,11 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $15, %ecx
-; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
-; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
-; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
-; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-SSE2-NEXT: andl $3, %ecx
+; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx
+; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi
+; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi
+; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
@@ -998,7 +3036,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_16bytes:
+; X86-SSE42-LABEL: ashr_16bytes_dwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
@@ -1021,8 +3059,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $15, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $32, %esp
; X86-SSE42-NEXT: popl %esi
@@ -1030,7 +3068,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_16bytes:
+; X86-AVX-LABEL: ashr_16bytes_dwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -1053,8 +3091,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $15, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $32, %esp
; X86-AVX-NEXT: popl %esi
@@ -1062,84 +3100,2731 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
- %byteOff = load i128, ptr %byteOff.ptr, align 1
- %bitOff = shl i128 %byteOff, 3
+ %dwordOff = load i128, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i128 %dwordOff, 5
%res = ashr i128 %src, %bitOff
store i128 %res, ptr %dst, align 1
ret void
}
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_32bytes:
+; FALLBACK0-LABEL: lshr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: movzbl %cl, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: movzbl %al, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: movzbl %cl, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: movzbl %al, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: movzbl %cl, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: movzbl %al, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: movzbl %cl, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: movzbl %al, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: movzbl %cl, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: movzbl %al, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: movzbl %cl, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: movzbl %al, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT: movl (%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ebp), %edi
+; FALLBACK16-NEXT: movl 16(%ebp), %ebx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movl 20(%ebp), %esi
+; FALLBACK16-NEXT: movl 24(%ebp), %ecx
+; FALLBACK16-NEXT: movl 28(%ebp), %ebp
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ah, %dh
+; FALLBACK16-NEXT: shlb $3, %dh
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ah
+; FALLBACK16-NEXT: movzbl %ah, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movb %dh, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: addl %eax, %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %esi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %esi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi
+; FALLBACK16-NEXT: movl %edi, %ebx
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %eax
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %dh, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 28(%eax)
+; FALLBACK16-NEXT: movl %esi, 24(%eax)
+; FALLBACK16-NEXT: movl %edi, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl %ebp, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl (%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ebp), %esi
+; FALLBACK17-NEXT: movl 12(%ebp), %edi
+; FALLBACK17-NEXT: movl 16(%ebp), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl 20(%ebp), %edx
+; FALLBACK17-NEXT: movl 24(%ebp), %eax
+; FALLBACK17-NEXT: movl 28(%ebp), %ebp
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: movzbl %ch, %ebp
+; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 24(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: shrl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT: movl %edi, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %esi, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %esi
+; FALLBACK18-NEXT: movl 12(%eax), %edi
+; FALLBACK18-NEXT: movl 16(%eax), %ebp
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl 20(%eax), %edx
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl 28(%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: movzbl %bl, %edi
+; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, %esi, %edx
+; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %ebx, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT: movl %ecx, %eax
+; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT: orl %esi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %edi, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %esi
+; FALLBACK19-NEXT: movl 12(%ecx), %edi
+; FALLBACK19-NEXT: movl 16(%ecx), %ebp
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl 20(%ecx), %edx
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl 28(%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: movzbl %bl, %ebp
+; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT: movl %esi, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movl %ecx, %eax
+; FALLBACK20-NEXT: shlb $3, %al
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: movzbl %cl, %ecx
+; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: movl %ecx, %edi
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %esi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %ebx
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 28(%eax)
+; FALLBACK20-NEXT: movl %esi, 4(%eax)
+; FALLBACK20-NEXT: movl %edi, 24(%eax)
+; FALLBACK20-NEXT: movl %ebp, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: movzbl %al, %ebp
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl %edi, %esi
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %esi, 4(%ebp)
+; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %edx, (%ebp)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlb $3, %dl
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %edx, %eax
+; FALLBACK22-NEXT: notb %al
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl %eax, %ebp
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK22-NEXT: movl %ebp, %ecx
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %ebx, %ebx
+; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %ebp, %ebx
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: movl %ecx, %edx
+; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT: orl %ebp, %edi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl %edi, 24(%edx)
+; FALLBACK22-NEXT: movl %ebx, 16(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, (%edx)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: movzbl %al, %ebx
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edi
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %ebx, 4(%eax)
+; FALLBACK23-NEXT: movl %ebp, 24(%eax)
+; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT: movl %ebx, 28(%eax)
+; FALLBACK23-NEXT: movl %esi, 16(%eax)
+; FALLBACK23-NEXT: movl %edi, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 12(%eax)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, (%eax)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movl %ecx, %eax
+; FALLBACK24-NEXT: shlb $3, %al
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: movzbl %cl, %ecx
+; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: movl %ecx, %edi
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %esi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %ebx
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 28(%eax)
+; FALLBACK24-NEXT: movl %esi, 4(%eax)
+; FALLBACK24-NEXT: movl %edi, 24(%eax)
+; FALLBACK24-NEXT: movl %ebp, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: movzbl %al, %ebp
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl %edi, %esi
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %esi, 4(%ebp)
+; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %edx, (%ebp)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlb $3, %dl
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %edx, %eax
+; FALLBACK26-NEXT: notb %al
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl %eax, %ebp
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK26-NEXT: movl %ebp, %ecx
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %ebx, %ebx
+; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %ebp, %ebx
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: movl %ecx, %edx
+; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT: orl %ebp, %edi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl %edi, 24(%edx)
+; FALLBACK26-NEXT: movl %ebx, 16(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%edx)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: movzbl %al, %ebx
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edi
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %ebx, 4(%eax)
+; FALLBACK27-NEXT: movl %ebp, 24(%eax)
+; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT: movl %ebx, 28(%eax)
+; FALLBACK27-NEXT: movl %esi, 16(%eax)
+; FALLBACK27-NEXT: movl %edi, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 12(%eax)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, (%eax)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movl %ecx, %eax
+; FALLBACK28-NEXT: shlb $3, %al
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: movzbl %cl, %ecx
+; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi
+; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: movl %ecx, %edi
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %esi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %ebx
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 28(%eax)
+; FALLBACK28-NEXT: movl %esi, 4(%eax)
+; FALLBACK28-NEXT: movl %edi, 24(%eax)
+; FALLBACK28-NEXT: movl %ebp, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: movzbl %al, %ebp
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl %edi, %esi
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %esi, 4(%ebp)
+; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %edx, (%ebp)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlb $3, %dl
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %edx, %eax
+; FALLBACK30-NEXT: notb %al
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl %eax, %ebp
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax
+; FALLBACK30-NEXT: movl %ebp, %ecx
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %ebx, %ebx
+; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %ebp, %ebx
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: movl %ecx, %edx
+; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT: orl %ebp, %edi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl %edi, 24(%edx)
+; FALLBACK30-NEXT: movl %ebx, 16(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%edx)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: movzbl %al, %ebx
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edi
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %ebx, 4(%eax)
+; FALLBACK31-NEXT: movl %ebp, 24(%eax)
+; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT: movl %ebx, 28(%eax)
+; FALLBACK31-NEXT: movl %esi, 16(%eax)
+; FALLBACK31-NEXT: movl %edi, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 12(%eax)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, (%eax)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $6, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $6, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $6, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $6, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: movl %ecx, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $6, %cl
+; FALLBACK4-NEXT: movzbl %cl, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: movl %eax, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $6, %al
+; FALLBACK5-NEXT: movzbl %al, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: shrq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: movl %ecx, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $6, %cl
+; FALLBACK6-NEXT: movzbl %cl, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: movl %eax, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $6, %al
+; FALLBACK7-NEXT: movzbl %al, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: movl %ecx, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $6, %cl
+; FALLBACK8-NEXT: movzbl %cl, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: movl %eax, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $6, %al
+; FALLBACK9-NEXT: movzbl %al, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: shrq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: movl %ecx, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $6, %cl
+; FALLBACK10-NEXT: movzbl %cl, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: movl %eax, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $6, %al
+; FALLBACK11-NEXT: movzbl %al, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: movl %ecx, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $6, %cl
+; FALLBACK12-NEXT: movzbl %cl, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: movl %eax, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $6, %al
+; FALLBACK13-NEXT: movzbl %al, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: shrq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: movl %ecx, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $6, %cl
+; FALLBACK14-NEXT: movzbl %cl, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: movl %eax, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $6, %al
+; FALLBACK15-NEXT: movzbl %al, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl (%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%eax), %esi
+; X86-SSE2-NEXT: movl 12(%eax), %edi
+; X86-SSE2-NEXT: movl 16(%eax), %ebx
+; X86-SSE2-NEXT: movl 20(%eax), %ebp
+; X86-SSE2-NEXT: movl 24(%eax), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movzbl (%eax), %eax
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-SSE2-NEXT: movl %edx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: lshr_32bytes_dwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: subl $76, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movups 16(%edx), %xmm1
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: xorps %xmm2, %xmm2
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $76, %esp
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: lshr_32bytes_dwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: subl $76, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovups %ymm0, (%esp)
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $76, %esp
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
+ %res = lshr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $3, %esi
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: lshr_32bytes:
+; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %eax
-; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: andl $3, %eax
+; X64-SSE42-NEXT: movups -72(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups -56(%rsp,%rax,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: lshr_32bytes:
+; X64-AVX-LABEL: lshr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %eax
-; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1
+; X64-AVX-NEXT: andl $3, %eax
+; X64-AVX-NEXT: vmovups -72(%rsp,%rax,8), %xmm0
+; X64-AVX-NEXT: vmovups -56(%rsp,%rax,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_32bytes:
+; X86-SSE2-LABEL: lshr_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
+; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%eax), %esi
; X86-SSE2-NEXT: movl 12(%eax), %edi
; X86-SSE2-NEXT: movl 16(%eax), %ebx
@@ -1148,35 +5833,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 28(%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movzbl (%eax), %eax
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $3, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 24(%eax)
; X86-SSE2-NEXT: movl %edx, 28(%eax)
@@ -1186,18 +5866,18 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_32bytes:
+; X86-SSE42-LABEL: lshr_32bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: subl $76, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1205,21 +5885,21 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
+; X86-SSE42-NEXT: addl $76, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: lshr_32bytes:
+; X86-AVX-LABEL: lshr_32bytes_qwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $64, %esp
+; X86-AVX-NEXT: subl $76, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1228,137 +5908,2830 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
+; X86-AVX-NEXT: addl $76, %esp
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
%res = lshr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_32bytes:
+; FALLBACK0-LABEL: shl_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: negb %sil
+; FALLBACK0-NEXT: movsbq %sil, %r10
+; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT: movq %r10, %rbx
+; FALLBACK0-NEXT: shrq %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: orq %r11, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: negb %sil
+; FALLBACK1-NEXT: movsbq %sil, %rax
+; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shldq %cl, %r8, %rax
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rax, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: negb %sil
+; FALLBACK2-NEXT: movsbq %sil, %rsi
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, (%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: negb %sil
+; FALLBACK3-NEXT: movsbq %sil, %rax
+; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shldq %cl, %r8, %rax
+; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rax, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: negb %cl
+; FALLBACK4-NEXT: movsbq %cl, %r8
+; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rdi
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r9, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT: movq %r8, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: movq %r9, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %r11, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: negb %al
+; FALLBACK5-NEXT: movsbq %al, %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq %r8, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r8, %rax
+; FALLBACK5-NEXT: movq %rax, 8(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: leal (,%rcx,8), %eax
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: negb %cl
+; FALLBACK6-NEXT: movsbq %cl, %rcx
+; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r8, %rcx
+; FALLBACK6-NEXT: shrq %r9
+; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, (%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: negb %al
+; FALLBACK7-NEXT: movsbq %al, %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r8, %rax
+; FALLBACK7-NEXT: movq %rax, 8(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: negb %cl
+; FALLBACK8-NEXT: movsbq %cl, %r8
+; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq %r10, %rdi
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r9, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT: movq %r8, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: movq %r9, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %r11, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: negb %al
+; FALLBACK9-NEXT: movsbq %al, %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq %r8, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r8, %rax
+; FALLBACK9-NEXT: movq %rax, 8(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: leal (,%rcx,8), %eax
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: negb %cl
+; FALLBACK10-NEXT: movsbq %cl, %rcx
+; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r8, %rcx
+; FALLBACK10-NEXT: shrq %r9
+; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, (%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: negb %al
+; FALLBACK11-NEXT: movsbq %al, %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r8, %rax
+; FALLBACK11-NEXT: movq %rax, 8(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: negb %cl
+; FALLBACK12-NEXT: movsbq %cl, %r8
+; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq %r10, %rdi
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r9, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT: movq %r8, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: movq %r9, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %r11, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: negb %al
+; FALLBACK13-NEXT: movsbq %al, %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq %r8, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r8, %rax
+; FALLBACK13-NEXT: movq %rax, 8(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: leal (,%rcx,8), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: negb %cl
+; FALLBACK14-NEXT: movsbq %cl, %rcx
+; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r8, %rcx
+; FALLBACK14-NEXT: shrq %r9
+; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, (%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: negb %al
+; FALLBACK15-NEXT: movsbq %al, %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r8, %rax
+; FALLBACK15-NEXT: movq %rax, 8(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ecx), %esi
+; FALLBACK16-NEXT: movl 12(%ecx), %edi
+; FALLBACK16-NEXT: movl 16(%ecx), %ebx
+; FALLBACK16-NEXT: movb (%eax), %ah
+; FALLBACK16-NEXT: movl 20(%ecx), %ebp
+; FALLBACK16-NEXT: movl 24(%ecx), %edx
+; FALLBACK16-NEXT: movl 28(%ecx), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movb %ah, %ch
+; FALLBACK16-NEXT: shlb $3, %ch
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %ah
+; FALLBACK16-NEXT: negb %ah
+; FALLBACK16-NEXT: movsbl %ah, %ebx
+; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movb %ch, %dl
+; FALLBACK16-NEXT: notb %dl
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %eax
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: orl %edi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi
+; FALLBACK16-NEXT: movl %edi, %ebx
+; FALLBACK16-NEXT: shrl %ebx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %edi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 24(%eax)
+; FALLBACK16-NEXT: movl %ebx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl %ebp, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl (%eax), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 12(%eax), %edi
+; FALLBACK17-NEXT: movl 16(%eax), %ebx
+; FALLBACK17-NEXT: movb (%ecx), %ch
+; FALLBACK17-NEXT: movl 20(%eax), %ebp
+; FALLBACK17-NEXT: movl 24(%eax), %edx
+; FALLBACK17-NEXT: movl 28(%eax), %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movb %ch, %cl
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %ch
+; FALLBACK17-NEXT: negb %ch
+; FALLBACK17-NEXT: movsbl %ch, %eax
+; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl %ebx, %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi
+; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx
+; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx
+; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi
+; FALLBACK17-NEXT: shldl %cl, %edx, %esi
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl %edx, 24(%eax)
+; FALLBACK17-NEXT: movl %esi, 28(%eax)
+; FALLBACK17-NEXT: movl %edi, 16(%eax)
+; FALLBACK17-NEXT: movl %ebp, 20(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, 8(%eax)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, 12(%eax)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: movl %ebx, (%eax)
+; FALLBACK17-NEXT: movl %edx, 4(%eax)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %esi
+; FALLBACK18-NEXT: movl 12(%eax), %edi
+; FALLBACK18-NEXT: movl 16(%eax), %ebp
+; FALLBACK18-NEXT: movzbl (%ebx), %ebx
+; FALLBACK18-NEXT: movl 20(%eax), %edx
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl 28(%eax), %eax
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, %edx
+; FALLBACK18-NEXT: shlb $3, %dl
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %bl
+; FALLBACK18-NEXT: negb %bl
+; FALLBACK18-NEXT: movsbl %bl, %esi
+; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
+; FALLBACK18-NEXT: movl %edx, %ecx
+; FALLBACK18-NEXT: notb %cl
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT: orl %edi, %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, %edi
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax
+; FALLBACK18-NEXT: orl %ebx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx
+; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %edi
+; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp
+; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT: orl %ebp, %esi
+; FALLBACK18-NEXT: shrl %ebx
+; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx
+; FALLBACK18-NEXT: orl %eax, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl %edx, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 28(%eax)
+; FALLBACK18-NEXT: movl %edi, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %esi
+; FALLBACK19-NEXT: movl 12(%ecx), %edi
+; FALLBACK19-NEXT: movl 16(%ecx), %ebp
+; FALLBACK19-NEXT: movzbl (%ebx), %ebx
+; FALLBACK19-NEXT: movl 20(%ecx), %edx
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl 28(%ecx), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %bl
+; FALLBACK19-NEXT: negb %bl
+; FALLBACK19-NEXT: movsbl %bl, %eax
+; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi
+; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx
+; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi
+; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx
+; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi
+; FALLBACK19-NEXT: shldl %cl, %edx, %esi
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl %edx, 24(%eax)
+; FALLBACK19-NEXT: movl %esi, 28(%eax)
+; FALLBACK19-NEXT: movl %edi, 16(%eax)
+; FALLBACK19-NEXT: movl %ebp, 20(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, 8(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, 12(%eax)
+; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload
+; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, (%eax)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl %ebx, 4(%eax)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movzbl (%eax), %ecx
+; FALLBACK20-NEXT: movb %cl, %dh
+; FALLBACK20-NEXT: shlb $3, %dh
+; FALLBACK20-NEXT: xorps %xmm2, %xmm2
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %cl
+; FALLBACK20-NEXT: negb %cl
+; FALLBACK20-NEXT: movsbl %cl, %eax
+; FALLBACK20-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movb %dh, %dl
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK20-NEXT: movl %eax, %ebx
+; FALLBACK20-NEXT: movl %esi, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK20-NEXT: movl %ebp, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %esi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %eax
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %esi
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: shrl %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %eax
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: orl %edi, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %ebp, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl %ebx, 4(%eax)
+; FALLBACK20-NEXT: movl %esi, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $92, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: xorps %xmm2, %xmm2
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: negb %al
+; FALLBACK21-NEXT: movsbl %al, %ebp
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK21-NEXT: shldl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK21-NEXT: shldl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, %eax
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %esi, %eax
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK21-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK21-NEXT: movl %ebp, 28(%edx)
+; FALLBACK21-NEXT: movl %eax, 24(%edx)
+; FALLBACK21-NEXT: movl %esi, %eax
+; FALLBACK21-NEXT: shll %cl, %eax
+; FALLBACK21-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK21-NEXT: movl %ebx, 4(%edx)
+; FALLBACK21-NEXT: movl %edi, 8(%edx)
+; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 12(%edx)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 16(%edx)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT: movl %ecx, 20(%edx)
+; FALLBACK21-NEXT: movl %eax, (%edx)
+; FALLBACK21-NEXT: addl $92, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: xorps %xmm2, %xmm2
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: negb %cl
+; FALLBACK22-NEXT: movsbl %cl, %edx
+; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT: movl %eax, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK22-NEXT: movl %ecx, %esi
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %edi
+; FALLBACK22-NEXT: shrl %edi
+; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %ecx, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %edi
+; FALLBACK22-NEXT: shrl %edi
+; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ecx, %ebp
+; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK22-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: shrl %edx
+; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK22-NEXT: orl %edi, %edx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK22-NEXT: movl %edi, (%esi)
+; FALLBACK22-NEXT: movl %edx, 28(%esi)
+; FALLBACK22-NEXT: movl %eax, 24(%esi)
+; FALLBACK22-NEXT: movl %ecx, 4(%esi)
+; FALLBACK22-NEXT: movl %ebp, 8(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 16(%esi)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%esi)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $92, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: xorps %xmm2, %xmm2
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: negb %al
+; FALLBACK23-NEXT: movsbl %al, %ebx
+; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK23-NEXT: shldl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK23-NEXT: shldl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, %eax
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %esi, %eax
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK23-NEXT: movl %ebx, 28(%edx)
+; FALLBACK23-NEXT: movl %eax, 24(%edx)
+; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK23-NEXT: movl %ebp, 4(%edx)
+; FALLBACK23-NEXT: movl %edi, 8(%edx)
+; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%edx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%edx)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%edx)
+; FALLBACK23-NEXT: movl %eax, (%edx)
+; FALLBACK23-NEXT: addl $92, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: movzbl (%eax), %ecx
+; FALLBACK24-NEXT: movb %cl, %dh
+; FALLBACK24-NEXT: shlb $3, %dh
+; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %cl
+; FALLBACK24-NEXT: negb %cl
+; FALLBACK24-NEXT: movsbl %cl, %eax
+; FALLBACK24-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movb %dh, %dl
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK24-NEXT: movl %eax, %ebx
+; FALLBACK24-NEXT: movl %esi, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %esi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %eax
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %esi
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: shrl %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %eax
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: orl %edi, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %ebp, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl %ebx, 4(%eax)
+; FALLBACK24-NEXT: movl %esi, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $92, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: negb %al
+; FALLBACK25-NEXT: movsbl %al, %ebp
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK25-NEXT: shldl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK25-NEXT: shldl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, %eax
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %esi, %eax
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK25-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK25-NEXT: movl %ebp, 28(%edx)
+; FALLBACK25-NEXT: movl %eax, 24(%edx)
+; FALLBACK25-NEXT: movl %esi, %eax
+; FALLBACK25-NEXT: shll %cl, %eax
+; FALLBACK25-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK25-NEXT: movl %ebx, 4(%edx)
+; FALLBACK25-NEXT: movl %edi, 8(%edx)
+; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 12(%edx)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 16(%edx)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT: movl %ecx, 20(%edx)
+; FALLBACK25-NEXT: movl %eax, (%edx)
+; FALLBACK25-NEXT: addl $92, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: negb %cl
+; FALLBACK26-NEXT: movsbl %cl, %edx
+; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT: movl %eax, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK26-NEXT: movl %ecx, %esi
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %edi
+; FALLBACK26-NEXT: shrl %edi
+; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %ecx, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %edi
+; FALLBACK26-NEXT: shrl %edi
+; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ecx, %ebp
+; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %edi, %ecx
+; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK26-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: shrl %edx
+; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK26-NEXT: orl %edi, %edx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK26-NEXT: movl %edi, (%esi)
+; FALLBACK26-NEXT: movl %edx, 28(%esi)
+; FALLBACK26-NEXT: movl %eax, 24(%esi)
+; FALLBACK26-NEXT: movl %ecx, 4(%esi)
+; FALLBACK26-NEXT: movl %ebp, 8(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%esi)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%esi)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $92, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: negb %al
+; FALLBACK27-NEXT: movsbl %al, %ebx
+; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK27-NEXT: shldl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK27-NEXT: shldl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, %eax
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %esi, %eax
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK27-NEXT: movl %ebx, 28(%edx)
+; FALLBACK27-NEXT: movl %eax, 24(%edx)
+; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT: movl %ebp, 4(%edx)
+; FALLBACK27-NEXT: movl %edi, 8(%edx)
+; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%edx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%edx)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%edx)
+; FALLBACK27-NEXT: movl %eax, (%edx)
+; FALLBACK27-NEXT: addl $92, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: movzbl (%eax), %ecx
+; FALLBACK28-NEXT: movb %cl, %dh
+; FALLBACK28-NEXT: shlb $3, %dh
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %cl
+; FALLBACK28-NEXT: negb %cl
+; FALLBACK28-NEXT: movsbl %cl, %eax
+; FALLBACK28-NEXT: movl 84(%esp,%eax), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movb %dh, %dl
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: movl 80(%esp,%eax), %esi
+; FALLBACK28-NEXT: movl %eax, %ebx
+; FALLBACK28-NEXT: movl %esi, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %esi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %eax
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %esi
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: shrl %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %eax
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: orl %edi, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %ebp, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl %ebx, 4(%eax)
+; FALLBACK28-NEXT: movl %esi, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $92, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: negb %al
+; FALLBACK29-NEXT: movsbl %al, %ebp
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx
+; FALLBACK29-NEXT: shldl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi
+; FALLBACK29-NEXT: shldl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, %eax
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %esi, %eax
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp
+; FALLBACK29-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK29-NEXT: movl %ebp, 28(%edx)
+; FALLBACK29-NEXT: movl %eax, 24(%edx)
+; FALLBACK29-NEXT: movl %esi, %eax
+; FALLBACK29-NEXT: shll %cl, %eax
+; FALLBACK29-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK29-NEXT: movl %ebx, 4(%edx)
+; FALLBACK29-NEXT: movl %edi, 8(%edx)
+; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 12(%edx)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 16(%edx)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT: movl %ecx, 20(%edx)
+; FALLBACK29-NEXT: movl %eax, (%edx)
+; FALLBACK29-NEXT: addl $92, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: negb %cl
+; FALLBACK30-NEXT: movsbl %cl, %edx
+; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT: movl %eax, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx
+; FALLBACK30-NEXT: movl %ecx, %esi
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
+; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %edi
+; FALLBACK30-NEXT: shrl %edi
+; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %ecx, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx
+; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %edi
+; FALLBACK30-NEXT: shrl %edi
+; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ecx, %ebp
+; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
+; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %edi, %ecx
+; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi
+; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx
+; FALLBACK30-NEXT: shlxl %eax, %edx, %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: shrl %edx
+; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
+; FALLBACK30-NEXT: orl %edi, %edx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK30-NEXT: movl %edi, (%esi)
+; FALLBACK30-NEXT: movl %edx, 28(%esi)
+; FALLBACK30-NEXT: movl %eax, 24(%esi)
+; FALLBACK30-NEXT: movl %ecx, 4(%esi)
+; FALLBACK30-NEXT: movl %ebp, 8(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%esi)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%esi)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $92, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: negb %al
+; FALLBACK31-NEXT: movsbl %al, %ebx
+; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx
+; FALLBACK31-NEXT: shldl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi
+; FALLBACK31-NEXT: shldl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edi
+; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, %eax
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %esi, %eax
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK31-NEXT: movl %ebx, 28(%edx)
+; FALLBACK31-NEXT: movl %eax, 24(%edx)
+; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT: movl %ebp, 4(%edx)
+; FALLBACK31-NEXT: movl %edi, 8(%edx)
+; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%edx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%edx)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%edx)
+; FALLBACK31-NEXT: movl %eax, (%edx)
+; FALLBACK31-NEXT: addl $92, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: shl_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: shlb $2, %sil
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: negb %sil
+; FALLBACK0-NEXT: movsbq %sil, %r10
+; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
+; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
+; FALLBACK0-NEXT: movq %r10, %rbx
+; FALLBACK0-NEXT: shrq %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: orq %r11, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: shlb $2, %sil
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: negb %sil
+; FALLBACK1-NEXT: movsbq %sil, %rax
+; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shldq %cl, %r8, %rax
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rax, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: shlb $2, %sil
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: negb %sil
+; FALLBACK2-NEXT: movsbq %sil, %rsi
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
+; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, (%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: shlb $2, %sil
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: negb %sil
+; FALLBACK3-NEXT: movsbq %sil, %rax
+; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shldq %cl, %r8, %rax
+; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rax, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movzbl (%rsi), %ecx
+; FALLBACK4-NEXT: movl %ecx, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: xorps %xmm2, %xmm2
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: shlb $2, %cl
+; FALLBACK4-NEXT: andb $24, %cl
+; FALLBACK4-NEXT: negb %cl
+; FALLBACK4-NEXT: movsbq %cl, %r8
+; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rdi
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r9, %rdi
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK4-NEXT: movq %r8, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: movq %r9, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, (%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %r11, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movzbl (%rsi), %eax
+; FALLBACK5-NEXT: movl %eax, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: xorps %xmm2, %xmm2
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: shlb $2, %al
+; FALLBACK5-NEXT: andb $24, %al
+; FALLBACK5-NEXT: negb %al
+; FALLBACK5-NEXT: movsbq %al, %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq %r8, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: shldq %cl, %r8, %rax
+; FALLBACK5-NEXT: movq %rax, 8(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movzbl (%rsi), %ecx
+; FALLBACK6-NEXT: movl %ecx, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: xorps %xmm2, %xmm2
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: shlb $2, %cl
+; FALLBACK6-NEXT: andb $24, %cl
+; FALLBACK6-NEXT: negb %cl
+; FALLBACK6-NEXT: movsbq %cl, %rcx
+; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: shrq %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r8, %rcx
+; FALLBACK6-NEXT: shrq %r9
+; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, (%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movzbl (%rsi), %eax
+; FALLBACK7-NEXT: movl %eax, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: xorps %xmm2, %xmm2
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: shlb $2, %al
+; FALLBACK7-NEXT: andb $24, %al
+; FALLBACK7-NEXT: negb %al
+; FALLBACK7-NEXT: movsbq %al, %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r8, %rax
+; FALLBACK7-NEXT: movq %rax, 8(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: movzbl (%rsi), %ecx
+; FALLBACK8-NEXT: movl %ecx, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: shlb $2, %cl
+; FALLBACK8-NEXT: andb $24, %cl
+; FALLBACK8-NEXT: negb %cl
+; FALLBACK8-NEXT: movsbq %cl, %r8
+; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK8-NEXT: movq %r10, %rdi
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r9, %rdi
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK8-NEXT: movq %r8, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: movq %r9, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, (%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %r11, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: movzbl (%rsi), %eax
+; FALLBACK9-NEXT: movl %eax, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: shlb $2, %al
+; FALLBACK9-NEXT: andb $24, %al
+; FALLBACK9-NEXT: negb %al
+; FALLBACK9-NEXT: movsbq %al, %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq %r8, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: shldq %cl, %r8, %rax
+; FALLBACK9-NEXT: movq %rax, 8(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: movzbl (%rsi), %ecx
+; FALLBACK10-NEXT: movl %ecx, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: shlb $2, %cl
+; FALLBACK10-NEXT: andb $24, %cl
+; FALLBACK10-NEXT: negb %cl
+; FALLBACK10-NEXT: movsbq %cl, %rcx
+; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: shrq %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r8, %rcx
+; FALLBACK10-NEXT: shrq %r9
+; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, (%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: movzbl (%rsi), %eax
+; FALLBACK11-NEXT: movl %eax, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: shlb $2, %al
+; FALLBACK11-NEXT: andb $24, %al
+; FALLBACK11-NEXT: negb %al
+; FALLBACK11-NEXT: movsbq %al, %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r8, %rax
+; FALLBACK11-NEXT: movq %rax, 8(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: movzbl (%rsi), %ecx
+; FALLBACK12-NEXT: movl %ecx, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: shlb $2, %cl
+; FALLBACK12-NEXT: andb $24, %cl
+; FALLBACK12-NEXT: negb %cl
+; FALLBACK12-NEXT: movsbq %cl, %r8
+; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
+; FALLBACK12-NEXT: movq %r10, %rdi
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r9, %rdi
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
+; FALLBACK12-NEXT: movq %r8, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: movq %r9, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, (%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %r11, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: movzbl (%rsi), %eax
+; FALLBACK13-NEXT: movl %eax, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: shlb $2, %al
+; FALLBACK13-NEXT: andb $24, %al
+; FALLBACK13-NEXT: negb %al
+; FALLBACK13-NEXT: movsbq %al, %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq %r8, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: shldq %cl, %r8, %rax
+; FALLBACK13-NEXT: movq %rax, 8(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx
+; FALLBACK14-NEXT: movl %ecx, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: shlb $2, %cl
+; FALLBACK14-NEXT: andb $24, %cl
+; FALLBACK14-NEXT: negb %cl
+; FALLBACK14-NEXT: movsbq %cl, %rcx
+; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
+; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
+; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
+; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: shrq %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r8, %rcx
+; FALLBACK14-NEXT: shrq %r9
+; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, (%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: movzbl (%rsi), %eax
+; FALLBACK15-NEXT: movl %eax, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: shlb $2, %al
+; FALLBACK15-NEXT: andb $24, %al
+; FALLBACK15-NEXT: negb %al
+; FALLBACK15-NEXT: movsbq %al, %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
+; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r8, %rax
+; FALLBACK15-NEXT: movq %rax, 8(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: shl_32bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT: movl (%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shlb $2, %cl
+; X86-SSE2-NEXT: andb $28, %cl
+; X86-SSE2-NEXT: negb %cl
+; X86-SSE2-NEXT: movsbl %cl, %edx
+; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: shl_32bytes_dwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: subl $76, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movups 16(%edx), %xmm1
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: xorps %xmm2, %xmm2
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, (%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $2, %cl
+; X86-SSE42-NEXT: andb $28, %cl
+; X86-SSE42-NEXT: negb %cl
+; X86-SSE42-NEXT: movsbl %cl, %ecx
+; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
+; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $76, %esp
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: shl_32bytes_dwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: subl $76, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %ymm0
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vmovups %ymm1, (%esp)
+; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: shlb $2, %cl
+; X86-AVX-NEXT: andb $28, %cl
+; X86-AVX-NEXT: negb %cl
+; X86-AVX-NEXT: movsbl %cl, %ecx
+; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
+; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $76, %esp
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
+ %res = shl i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andb $31, %sil
+; X64-SSE2-NEXT: shlb $3, %sil
+; X64-SSE2-NEXT: andb $24, %sil
; X64-SSE2-NEXT: negb %sil
; X64-SSE2-NEXT: movsbq %sil, %rax
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
+; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx
+; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi
+; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax
; X64-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rcx, (%rdx)
; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: shl_32bytes:
+; X64-SSE42-LABEL: shl_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andb $31, %al
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: shlb $3, %al
+; X64-SSE42-NEXT: andb $24, %al
; X64-SSE42-NEXT: negb %al
; X64-SSE42-NEXT: movsbq %al, %rax
-; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movups -40(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT: movups -24(%rsp,%rax), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: shl_32bytes:
+; X64-AVX-LABEL: shl_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andb $31, %al
+; X64-AVX-NEXT: shlb $3, %al
+; X64-AVX-NEXT: andb $24, %al
; X64-AVX-NEXT: negb %al
; X64-AVX-NEXT: movsbq %al, %rax
-; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1
+; X64-AVX-NEXT: vmovups -40(%rsp,%rax), %xmm0
+; X64-AVX-NEXT: vmovups -24(%rsp,%rax), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: shl_32bytes:
+; X86-SSE2-LABEL: shl_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT: movl (%edi), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%edi), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%edi), %esi
-; X86-SSE2-NEXT: movl 12(%edi), %ebx
-; X86-SSE2-NEXT: movl 16(%edi), %ebp
-; X86-SSE2-NEXT: movzbl (%eax), %eax
-; X86-SSE2-NEXT: movl 20(%edi), %edx
-; X86-SSE2-NEXT: movl 24(%edi), %ecx
-; X86-SSE2-NEXT: movl 28(%edi), %edi
-; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-SSE2-NEXT: movl (%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-SSE2-NEXT: movzbl (%ecx), %ecx
+; X86-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-SSE2-NEXT: movl 28(%ebp), %ebp
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andb $31, %al
-; X86-SSE2-NEXT: negb %al
-; X86-SSE2-NEXT: movsbl %al, %edx
-; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shlb $3, %cl
+; X86-SSE2-NEXT: andb $24, %cl
+; X86-SSE2-NEXT: negb %cl
+; X86-SSE2-NEXT: movsbl %cl, %edx
+; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax
-; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi
-; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi
-; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx
-; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp
-; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx
-; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx
+; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi
+; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi
+; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx
+; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp
+; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %edx, 24(%eax)
; X86-SSE2-NEXT: movl %ecx, 28(%eax)
@@ -1368,18 +8741,18 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_32bytes:
+; X86-SSE42-LABEL: shl_32bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: subl $76, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1387,23 +8760,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm2, %xmm2
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, (%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andb $31, %cl
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, (%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shlb $3, %cl
+; X86-SSE42-NEXT: andb $24, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $64, %esp
+; X86-SSE42-NEXT: addl $76, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: shl_32bytes:
+; X86-AVX-LABEL: shl_32bytes_qwordOff:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: subl $64, %esp
+; X86-AVX-NEXT: subl $76, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1412,25 +8786,3037 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %ymm1, (%esp)
; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andb $31, %cl
+; X86-AVX-NEXT: shlb $3, %cl
+; X86-AVX-NEXT: andb $24, %cl
; X86-AVX-NEXT: negb %cl
; X86-AVX-NEXT: movsbl %cl, %ecx
; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $64, %esp
+; X86-AVX-NEXT: addl $76, %esp
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
%res = shl i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_32bytes:
+; FALLBACK0-LABEL: ashr_32bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %rdi
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $24, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_32bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $24, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_32bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $24, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_32bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $24, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_32bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movq 16(%rdi), %rcx
+; FALLBACK4-NEXT: movq 24(%rdi), %rdi
+; FALLBACK4-NEXT: movzbl (%rsi), %esi
+; FALLBACK4-NEXT: leal (,%rsi,8), %eax
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rdi
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $24, %sil
+; FALLBACK4-NEXT: movzbl %sil, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_32bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movq 16(%rdi), %rax
+; FALLBACK5-NEXT: movq 24(%rdi), %rdi
+; FALLBACK5-NEXT: movzbl (%rsi), %esi
+; FALLBACK5-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $24, %sil
+; FALLBACK5-NEXT: movzbl %sil, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_32bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movq 16(%rdi), %rcx
+; FALLBACK6-NEXT: movq 24(%rdi), %rdi
+; FALLBACK6-NEXT: movzbl (%rsi), %esi
+; FALLBACK6-NEXT: leal (,%rsi,8), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $24, %sil
+; FALLBACK6-NEXT: movzbl %sil, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_32bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movq 16(%rdi), %rax
+; FALLBACK7-NEXT: movq 24(%rdi), %rdi
+; FALLBACK7-NEXT: movzbl (%rsi), %esi
+; FALLBACK7-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $24, %sil
+; FALLBACK7-NEXT: movzbl %sil, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_32bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT: movq 16(%rdi), %rcx
+; FALLBACK8-NEXT: movq 24(%rdi), %rdi
+; FALLBACK8-NEXT: movzbl (%rsi), %esi
+; FALLBACK8-NEXT: leal (,%rsi,8), %eax
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rdi
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $24, %sil
+; FALLBACK8-NEXT: movzbl %sil, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_32bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT: movq 16(%rdi), %rax
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi
+; FALLBACK9-NEXT: movzbl (%rsi), %esi
+; FALLBACK9-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $24, %sil
+; FALLBACK9-NEXT: movzbl %sil, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_32bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT: movq 16(%rdi), %rcx
+; FALLBACK10-NEXT: movq 24(%rdi), %rdi
+; FALLBACK10-NEXT: movzbl (%rsi), %esi
+; FALLBACK10-NEXT: leal (,%rsi,8), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $24, %sil
+; FALLBACK10-NEXT: movzbl %sil, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_32bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT: movq 16(%rdi), %rax
+; FALLBACK11-NEXT: movq 24(%rdi), %rdi
+; FALLBACK11-NEXT: movzbl (%rsi), %esi
+; FALLBACK11-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $24, %sil
+; FALLBACK11-NEXT: movzbl %sil, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_32bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT: movq 16(%rdi), %rcx
+; FALLBACK12-NEXT: movq 24(%rdi), %rdi
+; FALLBACK12-NEXT: movzbl (%rsi), %esi
+; FALLBACK12-NEXT: leal (,%rsi,8), %eax
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rdi
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $24, %sil
+; FALLBACK12-NEXT: movzbl %sil, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_32bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT: movq 16(%rdi), %rax
+; FALLBACK13-NEXT: movq 24(%rdi), %rdi
+; FALLBACK13-NEXT: movzbl (%rsi), %esi
+; FALLBACK13-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $24, %sil
+; FALLBACK13-NEXT: movzbl %sil, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_32bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT: movq 16(%rdi), %rcx
+; FALLBACK14-NEXT: movq 24(%rdi), %rdi
+; FALLBACK14-NEXT: movzbl (%rsi), %esi
+; FALLBACK14-NEXT: leal (,%rsi,8), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $24, %sil
+; FALLBACK14-NEXT: movzbl %sil, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_32bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT: movq 16(%rdi), %rax
+; FALLBACK15-NEXT: movq 24(%rdi), %rdi
+; FALLBACK15-NEXT: movzbl (%rsi), %esi
+; FALLBACK15-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $24, %sil
+; FALLBACK15-NEXT: movzbl %sil, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_32bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $108, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK16-NEXT: movl (%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%esi), %ebx
+; FALLBACK16-NEXT: movl 12(%esi), %ebp
+; FALLBACK16-NEXT: movl 16(%esi), %edi
+; FALLBACK16-NEXT: movzbl (%eax), %ecx
+; FALLBACK16-NEXT: movl 20(%esi), %edx
+; FALLBACK16-NEXT: movl 24(%esi), %eax
+; FALLBACK16-NEXT: movl 28(%esi), %esi
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, %edx
+; FALLBACK16-NEXT: shlb $3, %dl
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %esi
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: andb $28, %cl
+; FALLBACK16-NEXT: movzbl %cl, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
+; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
+; FALLBACK16-NEXT: movl %eax, %ebx
+; FALLBACK16-NEXT: movl %edx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movb %dl, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl %edx, %ebx
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%edx,%edx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %esi, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebx, %edx
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, %eax
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT: sarl %cl, %eax
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl %eax, 28(%ecx)
+; FALLBACK16-NEXT: movl %edx, 24(%ecx)
+; FALLBACK16-NEXT: movl %edi, 16(%ecx)
+; FALLBACK16-NEXT: movl %esi, 20(%ecx)
+; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 12(%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, (%ecx)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, 4(%ecx)
+; FALLBACK16-NEXT: addl $108, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_32bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $92, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %edx
+; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %ebp
+; FALLBACK17-NEXT: movl 16(%ecx), %ebx
+; FALLBACK17-NEXT: movzbl (%eax), %eax
+; FALLBACK17-NEXT: movl 20(%ecx), %edi
+; FALLBACK17-NEXT: movl 24(%ecx), %edx
+; FALLBACK17-NEXT: movl 28(%ecx), %esi
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, %ecx
+; FALLBACK17-NEXT: shlb $3, %cl
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %esi
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: andb $28, %al
+; FALLBACK17-NEXT: movzbl %al, %ebp
+; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
+; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT: sarl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK17-NEXT: movl %edi, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %esi, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $92, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_32bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $108, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FALLBACK18-NEXT: movl (%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%esi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%esi), %ebx
+; FALLBACK18-NEXT: movl 12(%esi), %ebp
+; FALLBACK18-NEXT: movl 16(%esi), %edi
+; FALLBACK18-NEXT: movzbl (%ecx), %ecx
+; FALLBACK18-NEXT: movl 20(%esi), %edx
+; FALLBACK18-NEXT: movl 24(%esi), %eax
+; FALLBACK18-NEXT: movl 28(%esi), %esi
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, %eax
+; FALLBACK18-NEXT: shlb $3, %al
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %esi
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: andb $28, %cl
+; FALLBACK18-NEXT: movzbl %cl, %edi
+; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx
+; FALLBACK18-NEXT: movl %eax, %edx
+; FALLBACK18-NEXT: notb %dl
+; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp
+; FALLBACK18-NEXT: orl %ebx, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
+; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
+; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
+; FALLBACK18-NEXT: orl %ebx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx
+; FALLBACK18-NEXT: movl %eax, %ebx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: orl %ecx, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
+; FALLBACK18-NEXT: orl %esi, %ecx
+; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %edx, %edi, %edx
+; FALLBACK18-NEXT: orl %eax, %edx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %ebx, 28(%eax)
+; FALLBACK18-NEXT: movl %edx, 24(%eax)
+; FALLBACK18-NEXT: movl %esi, 16(%eax)
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $108, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_32bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $92, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ecx), %ebp
+; FALLBACK19-NEXT: movl 16(%ecx), %ebx
+; FALLBACK19-NEXT: movzbl (%eax), %eax
+; FALLBACK19-NEXT: movl 20(%ecx), %edi
+; FALLBACK19-NEXT: movl 24(%ecx), %edx
+; FALLBACK19-NEXT: movl 28(%ecx), %esi
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, %ecx
+; FALLBACK19-NEXT: shlb $3, %cl
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %esi
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: andb $28, %al
+; FALLBACK19-NEXT: movzbl %al, %ebp
+; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl %edx, %esi
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
+; FALLBACK19-NEXT: movl %esi, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 4(%ebp)
+; FALLBACK19-NEXT: addl $92, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_32bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $108, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movl 16(%ecx), %esi
+; FALLBACK20-NEXT: movl 20(%ecx), %edi
+; FALLBACK20-NEXT: movl 24(%ecx), %ebx
+; FALLBACK20-NEXT: movl 28(%ecx), %edx
+; FALLBACK20-NEXT: movzbl (%eax), %eax
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shlb $3, %cl
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %edx
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: andb $28, %al
+; FALLBACK20-NEXT: movzbl %al, %edi
+; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl %ecx, %edx
+; FALLBACK20-NEXT: movb %cl, %dh
+; FALLBACK20-NEXT: notb %dl
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %eax, %esi
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT: movl %ebp, %eax
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebp, %ebp
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %eax, %ebp
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %edi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movl %edx, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dh, %cl
+; FALLBACK20-NEXT: sarl %cl, %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movl %eax, 28(%ecx)
+; FALLBACK20-NEXT: movl %esi, 4(%ecx)
+; FALLBACK20-NEXT: movl %edi, 24(%ecx)
+; FALLBACK20-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 20(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 8(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, 12(%ecx)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movl %eax, (%ecx)
+; FALLBACK20-NEXT: addl $108, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_32bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $108, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movl 16(%ecx), %esi
+; FALLBACK21-NEXT: movl 20(%ecx), %edi
+; FALLBACK21-NEXT: movl 24(%ecx), %ebx
+; FALLBACK21-NEXT: movl 28(%ecx), %edx
+; FALLBACK21-NEXT: movzbl (%eax), %eax
+; FALLBACK21-NEXT: movl %eax, %ecx
+; FALLBACK21-NEXT: shlb $3, %cl
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %edx
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: andb $28, %al
+; FALLBACK21-NEXT: movzbl %al, %ebp
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl %edi, %esi
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %esi, 4(%ebp)
+; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: sarl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %edx, (%ebp)
+; FALLBACK21-NEXT: addl $108, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_32bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $108, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movl 16(%ecx), %esi
+; FALLBACK22-NEXT: movl 20(%ecx), %edi
+; FALLBACK22-NEXT: movl 24(%ecx), %ebx
+; FALLBACK22-NEXT: movl 28(%ecx), %edx
+; FALLBACK22-NEXT: movzbl (%eax), %ecx
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shlb $3, %al
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %edx
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: andb $28, %cl
+; FALLBACK22-NEXT: movzbl %cl, %edi
+; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %eax, %edx
+; FALLBACK22-NEXT: notb %dl
+; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: orl %ebx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %ebx, %ebx
+; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT: orl %ebp, %ebx
+; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK22-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK22-NEXT: orl %ecx, %edi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: addl %ecx, %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %esi, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT: movl %eax, 28(%edx)
+; FALLBACK22-NEXT: movl %ecx, 4(%edx)
+; FALLBACK22-NEXT: movl %edi, 24(%edx)
+; FALLBACK22-NEXT: movl %ebx, 16(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 20(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 8(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, 12(%edx)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: movl %eax, (%edx)
+; FALLBACK22-NEXT: addl $108, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_32bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $108, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movl 16(%ecx), %esi
+; FALLBACK23-NEXT: movl 20(%ecx), %edi
+; FALLBACK23-NEXT: movl 24(%ecx), %ebx
+; FALLBACK23-NEXT: movl 28(%ecx), %edx
+; FALLBACK23-NEXT: movzbl (%eax), %eax
+; FALLBACK23-NEXT: movl %eax, %ecx
+; FALLBACK23-NEXT: shlb $3, %cl
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %edx
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: andb $28, %al
+; FALLBACK23-NEXT: movzbl %al, %ebx
+; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, %edi
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %ebx, 4(%eax)
+; FALLBACK23-NEXT: movl %ebp, 24(%eax)
+; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK23-NEXT: movl %ebx, 28(%eax)
+; FALLBACK23-NEXT: movl %esi, 16(%eax)
+; FALLBACK23-NEXT: movl %edi, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: movl %esi, 12(%eax)
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, (%eax)
+; FALLBACK23-NEXT: addl $108, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_32bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $108, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK24-NEXT: movl 16(%ecx), %esi
+; FALLBACK24-NEXT: movl 20(%ecx), %edi
+; FALLBACK24-NEXT: movl 24(%ecx), %ebx
+; FALLBACK24-NEXT: movl 28(%ecx), %edx
+; FALLBACK24-NEXT: movzbl (%eax), %eax
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shlb $3, %cl
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %edx
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: andb $28, %al
+; FALLBACK24-NEXT: movzbl %al, %edi
+; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl %ecx, %edx
+; FALLBACK24-NEXT: movb %cl, %dh
+; FALLBACK24-NEXT: notb %dl
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %eax, %esi
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebp, %ebp
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %eax, %ebp
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %edi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movl %edx, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dh, %cl
+; FALLBACK24-NEXT: sarl %cl, %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: movl %eax, 28(%ecx)
+; FALLBACK24-NEXT: movl %esi, 4(%ecx)
+; FALLBACK24-NEXT: movl %edi, 24(%ecx)
+; FALLBACK24-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 20(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 8(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, 12(%ecx)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movl %eax, (%ecx)
+; FALLBACK24-NEXT: addl $108, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_32bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $108, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK25-NEXT: movl 16(%ecx), %esi
+; FALLBACK25-NEXT: movl 20(%ecx), %edi
+; FALLBACK25-NEXT: movl 24(%ecx), %ebx
+; FALLBACK25-NEXT: movl 28(%ecx), %edx
+; FALLBACK25-NEXT: movzbl (%eax), %eax
+; FALLBACK25-NEXT: movl %eax, %ecx
+; FALLBACK25-NEXT: shlb $3, %cl
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %edx
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: andb $28, %al
+; FALLBACK25-NEXT: movzbl %al, %ebp
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl %edi, %esi
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %esi, 4(%ebp)
+; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: sarl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %edx, (%ebp)
+; FALLBACK25-NEXT: addl $108, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_32bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $108, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK26-NEXT: movl 16(%ecx), %esi
+; FALLBACK26-NEXT: movl 20(%ecx), %edi
+; FALLBACK26-NEXT: movl 24(%ecx), %ebx
+; FALLBACK26-NEXT: movl 28(%ecx), %edx
+; FALLBACK26-NEXT: movzbl (%eax), %ecx
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shlb $3, %al
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %edx
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: andb $28, %cl
+; FALLBACK26-NEXT: movzbl %cl, %edi
+; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %eax, %edx
+; FALLBACK26-NEXT: notb %dl
+; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: orl %ebx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %ecx, %eax
+; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %ebx, %ebx
+; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT: orl %ebp, %ebx
+; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK26-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK26-NEXT: orl %ecx, %edi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %esi, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT: movl %eax, 28(%edx)
+; FALLBACK26-NEXT: movl %ecx, 4(%edx)
+; FALLBACK26-NEXT: movl %edi, 24(%edx)
+; FALLBACK26-NEXT: movl %ebx, 16(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%edx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%edx)
+; FALLBACK26-NEXT: addl $108, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_32bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $108, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK27-NEXT: movl 16(%ecx), %esi
+; FALLBACK27-NEXT: movl 20(%ecx), %edi
+; FALLBACK27-NEXT: movl 24(%ecx), %ebx
+; FALLBACK27-NEXT: movl 28(%ecx), %edx
+; FALLBACK27-NEXT: movzbl (%eax), %eax
+; FALLBACK27-NEXT: movl %eax, %ecx
+; FALLBACK27-NEXT: shlb $3, %cl
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %edx
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: andb $28, %al
+; FALLBACK27-NEXT: movzbl %al, %ebx
+; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, %edi
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %ebx, 4(%eax)
+; FALLBACK27-NEXT: movl %ebp, 24(%eax)
+; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK27-NEXT: movl %ebx, 28(%eax)
+; FALLBACK27-NEXT: movl %esi, 16(%eax)
+; FALLBACK27-NEXT: movl %edi, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: movl %esi, 12(%eax)
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, (%eax)
+; FALLBACK27-NEXT: addl $108, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_32bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $108, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK28-NEXT: movl 16(%ecx), %esi
+; FALLBACK28-NEXT: movl 20(%ecx), %edi
+; FALLBACK28-NEXT: movl 24(%ecx), %ebx
+; FALLBACK28-NEXT: movl 28(%ecx), %edx
+; FALLBACK28-NEXT: movzbl (%eax), %eax
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shlb $3, %cl
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %edx
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: andb $28, %al
+; FALLBACK28-NEXT: movzbl %al, %edi
+; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax
+; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl %ecx, %edx
+; FALLBACK28-NEXT: movb %cl, %dh
+; FALLBACK28-NEXT: notb %dl
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %eax, %esi
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebp, %ebp
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %eax, %ebp
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %edi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movl %edx, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dh, %cl
+; FALLBACK28-NEXT: sarl %cl, %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: movl %eax, 28(%ecx)
+; FALLBACK28-NEXT: movl %esi, 4(%ecx)
+; FALLBACK28-NEXT: movl %edi, 24(%ecx)
+; FALLBACK28-NEXT: movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 20(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 8(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, 12(%ecx)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movl %eax, (%ecx)
+; FALLBACK28-NEXT: addl $108, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_32bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $108, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK29-NEXT: movl 16(%ecx), %esi
+; FALLBACK29-NEXT: movl 20(%ecx), %edi
+; FALLBACK29-NEXT: movl 24(%ecx), %ebx
+; FALLBACK29-NEXT: movl 28(%ecx), %edx
+; FALLBACK29-NEXT: movzbl (%eax), %eax
+; FALLBACK29-NEXT: movl %eax, %ecx
+; FALLBACK29-NEXT: shlb $3, %cl
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %edx
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: andb $28, %al
+; FALLBACK29-NEXT: movzbl %al, %ebp
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl %edi, %esi
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %esi, 4(%ebp)
+; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: sarl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %edx, (%ebp)
+; FALLBACK29-NEXT: addl $108, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_32bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $108, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK30-NEXT: movl 16(%ecx), %esi
+; FALLBACK30-NEXT: movl 20(%ecx), %edi
+; FALLBACK30-NEXT: movl 24(%ecx), %ebx
+; FALLBACK30-NEXT: movl 28(%ecx), %edx
+; FALLBACK30-NEXT: movzbl (%eax), %ecx
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shlb $3, %al
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %edx
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: andb $28, %cl
+; FALLBACK30-NEXT: movzbl %cl, %edi
+; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %eax, %edx
+; FALLBACK30-NEXT: notb %dl
+; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
+; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: orl %ebx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
+; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax
+; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
+; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %ecx, %eax
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %ebx, %ebx
+; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT: orl %ebp, %ebx
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
+; FALLBACK30-NEXT: sarxl %eax, %edi, %eax
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %edx, %edi, %edi
+; FALLBACK30-NEXT: orl %ecx, %edi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: addl %ecx, %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %esi, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT: movl %eax, 28(%edx)
+; FALLBACK30-NEXT: movl %ecx, 4(%edx)
+; FALLBACK30-NEXT: movl %edi, 24(%edx)
+; FALLBACK30-NEXT: movl %ebx, 16(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%edx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%edx)
+; FALLBACK30-NEXT: addl $108, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_32bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $108, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %xmm0
+; FALLBACK31-NEXT: movl 16(%ecx), %esi
+; FALLBACK31-NEXT: movl 20(%ecx), %edi
+; FALLBACK31-NEXT: movl 24(%ecx), %ebx
+; FALLBACK31-NEXT: movl 28(%ecx), %edx
+; FALLBACK31-NEXT: movzbl (%eax), %eax
+; FALLBACK31-NEXT: movl %eax, %ecx
+; FALLBACK31-NEXT: shlb $3, %cl
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %edx
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: andb $28, %al
+; FALLBACK31-NEXT: movzbl %al, %ebx
+; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
+; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
+; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, %edi
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
+; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
+; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %ebx, 4(%eax)
+; FALLBACK31-NEXT: movl %ebp, 24(%eax)
+; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; FALLBACK31-NEXT: movl %ebx, 28(%eax)
+; FALLBACK31-NEXT: movl %esi, 16(%eax)
+; FALLBACK31-NEXT: movl %edi, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: movl %esi, 12(%eax)
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, (%eax)
+; FALLBACK31-NEXT: addl $108, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %byteOff = load i256, ptr %byteOff.ptr, align 1
+ %bitOff = shl i256 %byteOff, 3
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
+; FALLBACK0-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rcx
+; FALLBACK0-NEXT: movq 8(%rdi), %r8
+; FALLBACK0-NEXT: movq 16(%rdi), %r9
+; FALLBACK0-NEXT: movq 24(%rdi), %rdi
+; FALLBACK0-NEXT: movzbl (%rsi), %esi
+; FALLBACK0-NEXT: movl %esi, %eax
+; FALLBACK0-NEXT: shlb $5, %al
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %rdi
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: andb $6, %sil
+; FALLBACK0-NEXT: movzbl %sil, %r9d
+; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r11, %r8
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %rdi, %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rdi
+; FALLBACK0-NEXT: orq %r10, %rdi
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %r9
+; FALLBACK0-NEXT: movq %r9, 24(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %rdi, (%rdx)
+; FALLBACK0-NEXT: movq %r8, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %rdi
+; FALLBACK1-NEXT: movzbl (%rsi), %esi
+; FALLBACK1-NEXT: movl %esi, %ecx
+; FALLBACK1-NEXT: shlb $5, %cl
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: andb $6, %sil
+; FALLBACK1-NEXT: movzbl %sil, %eax
+; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK1-NEXT: movq %r8, %r9
+; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rax, 24(%rdx)
+; FALLBACK1-NEXT: movq %rdi, (%rdx)
+; FALLBACK1-NEXT: movq %r9, 8(%rdx)
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %rdi
+; FALLBACK2-NEXT: movzbl (%rsi), %esi
+; FALLBACK2-NEXT: movl %esi, %eax
+; FALLBACK2-NEXT: shlb $5, %al
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: andb $6, %sil
+; FALLBACK2-NEXT: movzbl %sil, %ecx
+; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
+; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
+; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
+; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
+; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK2-NEXT: notb %al
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r8, %rdi
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r9, %rsi
+; FALLBACK2-NEXT: addq %rcx, %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
+; FALLBACK2-NEXT: orq %r10, %rax
+; FALLBACK2-NEXT: movq %r11, 24(%rdx)
+; FALLBACK2-NEXT: movq %rax, 16(%rdx)
+; FALLBACK2-NEXT: movq %rsi, (%rdx)
+; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %rdi
+; FALLBACK3-NEXT: movzbl (%rsi), %esi
+; FALLBACK3-NEXT: movl %esi, %ecx
+; FALLBACK3-NEXT: shlb $5, %cl
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: andb $6, %sil
+; FALLBACK3-NEXT: movzbl %sil, %eax
+; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
+; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
+; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
+; FALLBACK3-NEXT: movq %r8, %r9
+; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
+; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
+; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rax, 24(%rdx)
+; FALLBACK3-NEXT: movq %rdi, (%rdx)
+; FALLBACK3-NEXT: movq %r9, 8(%rdx)
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movq 16(%rdi), %rcx
+; FALLBACK4-NEXT: movq 24(%rdi), %rdi
+; FALLBACK4-NEXT: movzbl (%rsi), %esi
+; FALLBACK4-NEXT: movl %esi, %eax
+; FALLBACK4-NEXT: shlb $5, %al
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rdi
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: andb $6, %sil
+; FALLBACK4-NEXT: movzbl %sil, %r9d
+; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK4-NEXT: movq %r10, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r11, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r8, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %r9
+; FALLBACK4-NEXT: movq %r9, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movq 16(%rdi), %rax
+; FALLBACK5-NEXT: movq 24(%rdi), %rdi
+; FALLBACK5-NEXT: movzbl (%rsi), %esi
+; FALLBACK5-NEXT: movl %esi, %ecx
+; FALLBACK5-NEXT: shlb $5, %cl
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: andb $6, %sil
+; FALLBACK5-NEXT: movzbl %sil, %eax
+; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK5-NEXT: movq %rdi, %r8
+; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK5-NEXT: movq %rax, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK5-NEXT: sarq %cl, %rsi
+; FALLBACK5-NEXT: movq %r10, 8(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movq 16(%rdi), %rcx
+; FALLBACK6-NEXT: movq 24(%rdi), %rdi
+; FALLBACK6-NEXT: movzbl (%rsi), %esi
+; FALLBACK6-NEXT: movl %esi, %eax
+; FALLBACK6-NEXT: shlb $5, %al
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: andb $6, %sil
+; FALLBACK6-NEXT: movzbl %sil, %ecx
+; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK6-NEXT: notb %al
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK6-NEXT: orq %rsi, %rdi
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK6-NEXT: orq %r9, %rcx
+; FALLBACK6-NEXT: addq %r8, %r8
+; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK6-NEXT: orq %r10, %rax
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %rax, 8(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK6-NEXT: movq %rdi, (%rdx)
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movq 16(%rdi), %rax
+; FALLBACK7-NEXT: movq 24(%rdi), %rdi
+; FALLBACK7-NEXT: movzbl (%rsi), %esi
+; FALLBACK7-NEXT: movl %esi, %ecx
+; FALLBACK7-NEXT: shlb $5, %cl
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: andb $6, %sil
+; FALLBACK7-NEXT: movzbl %sil, %eax
+; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK7-NEXT: movq %rdi, %r8
+; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK7-NEXT: movq %rax, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK7-NEXT: movq %r10, 8(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rax, 24(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK8-NEXT: movq 16(%rdi), %rcx
+; FALLBACK8-NEXT: movq 24(%rdi), %rdi
+; FALLBACK8-NEXT: movzbl (%rsi), %esi
+; FALLBACK8-NEXT: movl %esi, %eax
+; FALLBACK8-NEXT: shlb $5, %al
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rdi
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: andb $6, %sil
+; FALLBACK8-NEXT: movzbl %sil, %r9d
+; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK8-NEXT: movq %r10, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r11, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r8, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK9-NEXT: movq 16(%rdi), %rax
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi
+; FALLBACK9-NEXT: movzbl (%rsi), %esi
+; FALLBACK9-NEXT: movl %esi, %ecx
+; FALLBACK9-NEXT: shlb $5, %cl
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: andb $6, %sil
+; FALLBACK9-NEXT: movzbl %sil, %eax
+; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK9-NEXT: movq %rdi, %r8
+; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK9-NEXT: movq %rax, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK9-NEXT: sarq %cl, %rsi
+; FALLBACK9-NEXT: movq %r10, 8(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK10-NEXT: movq 16(%rdi), %rcx
+; FALLBACK10-NEXT: movq 24(%rdi), %rdi
+; FALLBACK10-NEXT: movzbl (%rsi), %esi
+; FALLBACK10-NEXT: movl %esi, %eax
+; FALLBACK10-NEXT: shlb $5, %al
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: andb $6, %sil
+; FALLBACK10-NEXT: movzbl %sil, %ecx
+; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK10-NEXT: notb %al
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK10-NEXT: orq %rsi, %rdi
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK10-NEXT: orq %r9, %rcx
+; FALLBACK10-NEXT: addq %r8, %r8
+; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK10-NEXT: orq %r10, %rax
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %rax, 8(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK10-NEXT: movq %rdi, (%rdx)
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK11-NEXT: movq 16(%rdi), %rax
+; FALLBACK11-NEXT: movq 24(%rdi), %rdi
+; FALLBACK11-NEXT: movzbl (%rsi), %esi
+; FALLBACK11-NEXT: movl %esi, %ecx
+; FALLBACK11-NEXT: shlb $5, %cl
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: andb $6, %sil
+; FALLBACK11-NEXT: movzbl %sil, %eax
+; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK11-NEXT: movq %rdi, %r8
+; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK11-NEXT: movq %rax, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK11-NEXT: movq %r10, 8(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rax, 24(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK12-NEXT: movq 16(%rdi), %rcx
+; FALLBACK12-NEXT: movq 24(%rdi), %rdi
+; FALLBACK12-NEXT: movzbl (%rsi), %esi
+; FALLBACK12-NEXT: movl %esi, %eax
+; FALLBACK12-NEXT: shlb $5, %al
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rdi
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: andb $6, %sil
+; FALLBACK12-NEXT: movzbl %sil, %r9d
+; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
+; FALLBACK12-NEXT: movq %r10, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r11, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r8, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK13-NEXT: movq 16(%rdi), %rax
+; FALLBACK13-NEXT: movq 24(%rdi), %rdi
+; FALLBACK13-NEXT: movzbl (%rsi), %esi
+; FALLBACK13-NEXT: movl %esi, %ecx
+; FALLBACK13-NEXT: shlb $5, %cl
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: andb $6, %sil
+; FALLBACK13-NEXT: movzbl %sil, %eax
+; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r8
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK13-NEXT: movq %rax, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK13-NEXT: sarq %cl, %rsi
+; FALLBACK13-NEXT: movq %r10, 8(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK14-NEXT: movq 16(%rdi), %rcx
+; FALLBACK14-NEXT: movq 24(%rdi), %rdi
+; FALLBACK14-NEXT: movzbl (%rsi), %esi
+; FALLBACK14-NEXT: movl %esi, %eax
+; FALLBACK14-NEXT: shlb $5, %al
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: andb $6, %sil
+; FALLBACK14-NEXT: movzbl %sil, %ecx
+; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
+; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
+; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
+; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
+; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
+; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
+; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
+; FALLBACK14-NEXT: notb %al
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
+; FALLBACK14-NEXT: orq %rsi, %rdi
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
+; FALLBACK14-NEXT: orq %r9, %rcx
+; FALLBACK14-NEXT: addq %r8, %r8
+; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
+; FALLBACK14-NEXT: orq %r10, %rax
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK14-NEXT: movq %rdi, (%rdx)
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
+; FALLBACK15-NEXT: movq 16(%rdi), %rax
+; FALLBACK15-NEXT: movq 24(%rdi), %rdi
+; FALLBACK15-NEXT: movzbl (%rsi), %esi
+; FALLBACK15-NEXT: movl %esi, %ecx
+; FALLBACK15-NEXT: shlb $5, %cl
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: andb $6, %sil
+; FALLBACK15-NEXT: movzbl %sil, %eax
+; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
+; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
+; FALLBACK15-NEXT: movq %rdi, %r8
+; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
+; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
+; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
+; FALLBACK15-NEXT: movq %rax, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
+; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
+; FALLBACK15-NEXT: movq %r10, 8(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rax, 24(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: retq
+;
+; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: subl $92, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl (%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%eax), %edi
+; X86-SSE2-NEXT: movl 12(%eax), %ebx
+; X86-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-SSE2-NEXT: movl 20(%eax), %esi
+; X86-SSE2-NEXT: movl 24(%eax), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movzbl (%eax), %eax
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: sarl $31, %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-SSE2-NEXT: movl %edx, 28(%eax)
+; X86-SSE2-NEXT: movl %ebp, 16(%eax)
+; X86-SSE2-NEXT: movl %ebx, 20(%eax)
+; X86-SSE2-NEXT: movl %edi, 8(%eax)
+; X86-SSE2-NEXT: movl %esi, 12(%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, (%eax)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-SSE2-NEXT: addl $92, %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %edi
+; X86-SSE2-NEXT: popl %ebx
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: ashr_32bytes_dwordOff:
+; X86-SSE42: # %bb.0:
+; X86-SSE42-NEXT: pushl %ebx
+; X86-SSE42-NEXT: pushl %edi
+; X86-SSE42-NEXT: pushl %esi
+; X86-SSE42-NEXT: subl $64, %esp
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE42-NEXT: movups (%edx), %xmm0
+; X86-SSE42-NEXT: movl 16(%edx), %esi
+; X86-SSE42-NEXT: movl 20(%edx), %edi
+; X86-SSE42-NEXT: movl 24(%edx), %ebx
+; X86-SSE42-NEXT: movl 28(%edx), %edx
+; X86-SSE42-NEXT: movzbl (%ecx), %ecx
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: sarl $31, %edx
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1
+; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
+; X86-SSE42-NEXT: movups %xmm0, (%eax)
+; X86-SSE42-NEXT: addl $64, %esp
+; X86-SSE42-NEXT: popl %esi
+; X86-SSE42-NEXT: popl %edi
+; X86-SSE42-NEXT: popl %ebx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: ashr_32bytes_dwordOff:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: pushl %ebx
+; X86-AVX-NEXT: pushl %edi
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: subl $64, %esp
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-AVX-NEXT: movl 16(%edx), %esi
+; X86-AVX-NEXT: movl 20(%edx), %edi
+; X86-AVX-NEXT: movl 24(%edx), %ebx
+; X86-AVX-NEXT: movl 28(%edx), %edx
+; X86-AVX-NEXT: movzbl (%ecx), %ecx
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-AVX-NEXT: sarl $31, %edx
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1
+; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX-NEXT: addl $64, %esp
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: popl %edi
+; X86-AVX-NEXT: popl %ebx
+; X86-AVX-NEXT: retl
+ %src = load i256, ptr %src.ptr, align 1
+ %dwordOff = load i256, ptr %dwordOff.ptr, align 1
+ %bitOff = shl i256 %dwordOff, 5
+ %res = ashr i256 %src, %bitOff
+ store i256 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
@@ -1446,18 +11832,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $31, %esi
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $3, %esi
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: ashr_32bytes:
+; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movq 16(%rdi), %rax
@@ -1465,20 +11851,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movzbl (%rsi), %esi
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $31, %esi
-; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1
+; X64-SSE42-NEXT: andl $3, %esi
+; X64-SSE42-NEXT: movups -72(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups -56(%rsp,%rsi,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_32bytes:
+; X64-AVX-LABEL: ashr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-AVX-NEXT: movq 16(%rdi), %rax
@@ -1486,31 +11872,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movzbl (%rsi), %esi
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $31, %esi
-; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1
+; X64-AVX-NEXT: andl $3, %esi
+; X64-AVX-NEXT: vmovups -72(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups -56(%rsp,%rsi,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_32bytes:
+; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $72, %esp
+; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%eax), %edi
; X86-SSE2-NEXT: movl 12(%eax), %ebx
; X86-SSE2-NEXT: movl 16(%eax), %ebp
@@ -1525,7 +11911,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1538,17 +11924,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $31, %eax
-; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $3, %eax
+; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 24(%eax)
; X86-SSE2-NEXT: movl %edx, 28(%eax)
@@ -1558,16 +11944,16 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $72, %esp
+; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_32bytes:
+; X86-SSE42-LABEL: ashr_32bytes_qwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
@@ -1586,7 +11972,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1596,9 +11982,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $31, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
+; X86-SSE42-NEXT: andl $3, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $64, %esp
@@ -1607,7 +11993,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_32bytes:
+; X86-AVX-LABEL: ashr_32bytes_qwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -1626,7 +12012,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm0, (%esp)
+; X86-AVX-NEXT: vmovaps %xmm0, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -1636,9 +12022,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $31, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
+; X86-AVX-NEXT: andl $3, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $64, %esp
@@ -1647,15 +12033,3662 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
- %byteOff = load i256, ptr %byteOff.ptr, align 1
- %bitOff = shl i256 %byteOff, 3
+ %qwordOff = load i256, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i256 %qwordOff, 6
%res = ashr i256 %src, %bitOff
store i256 %res, ptr %dst, align 1
ret void
}
define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: lshr_64bytes:
+; FALLBACK0-LABEL: lshr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r10, %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq %r10, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r10, %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %rbx, %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r14, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %r13, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: lshr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r15
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: movq (%rdi), %rcx
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %r10
+; FALLBACK1-NEXT: movq 32(%rdi), %r11
+; FALLBACK1-NEXT: movq 40(%rdi), %rbx
+; FALLBACK1-NEXT: movq 48(%rdi), %r14
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq %r11, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT: movq %r14, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shrq %cl, %rax
+; FALLBACK1-NEXT: movq %r11, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r10, 32(%rdx)
+; FALLBACK1-NEXT: movq %r15, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: popq %r15
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: lshr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %r10
+; FALLBACK2-NEXT: movq 32(%rdi), %r11
+; FALLBACK2-NEXT: movq 40(%rdi), %rbx
+; FALLBACK2-NEXT: movq 48(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %rbx, %r9
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r13, %rdi
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %r8, %r10
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %r15, %r8
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %rbp, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %r13, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r8, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %r10, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: lshr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r15
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: movq (%rdi), %rcx
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %r10
+; FALLBACK3-NEXT: movq 32(%rdi), %r11
+; FALLBACK3-NEXT: movq 40(%rdi), %rbx
+; FALLBACK3-NEXT: movq 48(%rdi), %r14
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq %r11, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT: movq %r14, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq %r11, 48(%rdx)
+; FALLBACK3-NEXT: movq %r10, 32(%rdx)
+; FALLBACK3-NEXT: movq %r15, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: popq %r15
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: lshr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %r8d
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%r8,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %r8d
+; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rdi
+; FALLBACK4-NEXT: orq %r10, %rdi
+; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10
+; FALLBACK4-NEXT: movq %r10, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8
+; FALLBACK4-NEXT: leaq (%r8,%r8), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: addq %rbx, %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: movq %r8, 56(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %rdi, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: lshr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shrq %cl, %r11
+; FALLBACK5-NEXT: movq %r15, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 48(%rdx)
+; FALLBACK5-NEXT: movq %r11, 56(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r14, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: lshr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %esi
+; FALLBACK6-NEXT: andl $56, %esi
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT: movl %esi, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: lshr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r10, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT: movq %r11, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT: movq %r15, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 48(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r14, (%rdx)
+; FALLBACK7-NEXT: movq %r10, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: lshr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %r9d
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%r9,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %r9d
+; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rdi
+; FALLBACK8-NEXT: orq %r10, %rdi
+; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK8-NEXT: movq %r10, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK8-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: addq %rbx, %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r8, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: movq %r9, 56(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %rdi, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: lshr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shrq %cl, %r11
+; FALLBACK9-NEXT: movq %r15, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 48(%rdx)
+; FALLBACK9-NEXT: movq %r11, 56(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r14, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: lshr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %esi
+; FALLBACK10-NEXT: andl $56, %esi
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT: movl %esi, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: lshr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r10, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT: movq %r11, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT: movq %r15, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 48(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r14, (%rdx)
+; FALLBACK11-NEXT: movq %r10, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: lshr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %r9d
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%r9,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %r9d
+; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rdi
+; FALLBACK12-NEXT: orq %r10, %rdi
+; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: addq %rbx, %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r8, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: movq %r9, 56(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %rdi, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: lshr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %edi
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rdi,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %edi
+; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi
+; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq %r9, %rax
+; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax
+; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9
+; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi
+; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11
+; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14
+; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi
+; FALLBACK13-NEXT: movq %rdi, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shrq %cl, %r11
+; FALLBACK13-NEXT: movq %r15, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 48(%rdx)
+; FALLBACK13-NEXT: movq %r11, 56(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 32(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rax, 24(%rdx)
+; FALLBACK13-NEXT: movq %r14, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: lshr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %esi
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %esi
+; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax
+; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12
+; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10
+; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15
+; FALLBACK14-NEXT: movl %ecx, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12
+; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rsi, %rsi
+; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT: orq %r13, %rsi
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %rbp, %rax
+; FALLBACK14-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK14-NEXT: movq %rax, 8(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: lshr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r10, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT: movq %r11, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT: movq %r15, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 48(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r14, (%rdx)
+; FALLBACK15-NEXT: movq %r10, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: lshr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %edi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %edi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %eax, %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %eax, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %ebp, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %ebp, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK16-NEXT: movl %esi, %ebx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: shrl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: lshr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: shrl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: lshr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: lshr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ecx), %eax
+; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ecx), %ebp
+; FALLBACK19-NEXT: movl 44(%ecx), %ebx
+; FALLBACK19-NEXT: movl 48(%ecx), %edi
+; FALLBACK19-NEXT: movl 52(%ecx), %esi
+; FALLBACK19-NEXT: movl 56(%ecx), %edx
+; FALLBACK19-NEXT: movl 60(%ecx), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: lshr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: lshr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: shrl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: lshr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %ecx
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%ecx,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK22-NEXT: orl %edi, %ebp
+; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edi, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %ecx, %eax
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: lshr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ecx
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: lshr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %ecx
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %ecx
+; FALLBACK24-NEXT: andl $24, %ecx
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: movb %cl, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %ebp, %eax
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: lshr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: shrl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: lshr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %ecx
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%ecx,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %edi
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK26-NEXT: orl %edi, %ebp
+; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %esi
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx
+; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx
+; FALLBACK26-NEXT: addl %ecx, %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: movl %edx, 60(%ecx)
+; FALLBACK26-NEXT: movl %ebx, 56(%ecx)
+; FALLBACK26-NEXT: movl %edi, 48(%ecx)
+; FALLBACK26-NEXT: movl %esi, 52(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 40(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 44(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 32(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 36(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 24(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 28(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 16(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 20(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 8(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 12(%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, (%ecx)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: movl %eax, 4(%ecx)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: lshr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ecx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: lshr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %ecx
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %ecx
+; FALLBACK28-NEXT: andl $24, %ecx
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: movb %cl, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %ebp, %eax
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: lshr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: shrl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: lshr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %edx
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%edx,8), %ecx
+; FALLBACK30-NEXT: andl $24, %ecx
+; FALLBACK30-NEXT: andl $60, %edx
+; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi
+; FALLBACK30-NEXT: movl %ecx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp
+; FALLBACK30-NEXT: orl %edi, %ebp
+; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi
+; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi
+; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi
+; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax
+; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %esi
+; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax
+; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx
+; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp
+; FALLBACK30-NEXT: leal (%edx,%edx), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx
+; FALLBACK30-NEXT: orl %eax, %edx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: movl %ebp, 60(%ecx)
+; FALLBACK30-NEXT: movl %edx, 56(%ecx)
+; FALLBACK30-NEXT: movl %edi, 48(%ecx)
+; FALLBACK30-NEXT: movl %esi, 52(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 40(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 44(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 32(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 36(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 24(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 28(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 16(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 20(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 8(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 12(%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, (%ecx)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: movl %eax, 4(%ecx)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: lshr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ecx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = lshr i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -1667,6 +15700,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq 48(%rdi), %rbx
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -1675,23 +15713,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $7, %esi
+; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -1703,35 +15733,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: lshr_64bytes:
+; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
-; X64-SSE42-NEXT: movups -128(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: andl $7, %eax
+; X64-SSE42-NEXT: movups -128(%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups -112(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT: movups -96(%rsp,%rax,8), %xmm2
+; X64-SSE42-NEXT: movups -80(%rsp,%rax,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX1-LABEL: lshr_64bytes:
+; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
@@ -1740,44 +15773,47 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
-; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT: andl $7, %eax
+; X64-AVX1-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX1-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX1-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX1-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: lshr_64bytes:
+; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: pushq %rax
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT: andl $7, %eax
+; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
+; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
+; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
+; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT: popq %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
-; X86-SSE2-LABEL: lshr_64bytes:
+; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
+; X86-SSE2-NEXT: subl $188, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1798,7 +15834,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 32(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 40(%eax), %ebp
; X86-SSE2-NEXT: movl 44(%eax), %ebx
; X86-SSE2-NEXT: movl 48(%eax), %edi
@@ -1807,13 +15843,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 60(%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %eax
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -1821,6 +15861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1833,49 +15874,33 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 56(%eax)
; X86-SSE2-NEXT: movl %edx, 60(%eax)
@@ -1883,7 +15908,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -1903,16 +15928,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: lshr_64bytes:
+; X86-SSE42-LABEL: lshr_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
+; X86-SSE42-NEXT: subl $140, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1922,29 +15947,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 48(%edx), %xmm3
; X86-SSE42-NEXT: movl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
+; X86-SSE42-NEXT: addl $140, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX1-LABEL: lshr_64bytes:
+; X86-AVX1-LABEL: lshr_64bytes_qwordOff:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: subl $140, %esp
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1956,22 +15981,22 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
-; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX1-NEXT: andl $7, %ecx
+; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: addl $140, %esp
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: lshr_64bytes:
+; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
+; X86-AVX512-NEXT: subl $140, %esp
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -1980,27 +16005,3801 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
-; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX512-NEXT: andl $7, %ecx
+; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
+; X86-AVX512-NEXT: addl $140, %esp
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = lshr i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: shl_64bytes:
+; FALLBACK0-LABEL: shl_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %rdi
+; FALLBACK0-NEXT: movl (%rsi), %esi
+; FALLBACK0-NEXT: xorps %xmm0, %xmm0
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rsi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %esi
+; FALLBACK0-NEXT: negl %esi
+; FALLBACK0-NEXT: movslq %esi, %rbx
+; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8
+; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi
+; FALLBACK0-NEXT: movq %rdi, %r10
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq %r8, %r9
+; FALLBACK0-NEXT: shrq %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r9
+; FALLBACK0-NEXT: orq %r10, %r9
+; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10
+; FALLBACK0-NEXT: movq %r10, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15
+; FALLBACK0-NEXT: movq %r15, %r11
+; FALLBACK0-NEXT: shrq %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: orq %r14, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: shrq %rdi
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rdi
+; FALLBACK0-NEXT: orq %r15, %rdi
+; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14
+; FALLBACK0-NEXT: movq %r14, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13
+; FALLBACK0-NEXT: movq %r13, %r15
+; FALLBACK0-NEXT: shrq %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r13
+; FALLBACK0-NEXT: shrq %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: orq %r13, %r10
+; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r12
+; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r13
+; FALLBACK0-NEXT: shrq %r13
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: orq %r12, %r13
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: shrq %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: orq %rbx, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %r13, 56(%rdx)
+; FALLBACK0-NEXT: movq %r10, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: shl_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: pushq %rax
+; FALLBACK1-NEXT: movq (%rdi), %rax
+; FALLBACK1-NEXT: movq 8(%rdi), %rcx
+; FALLBACK1-NEXT: movq 16(%rdi), %r8
+; FALLBACK1-NEXT: movq 24(%rdi), %r9
+; FALLBACK1-NEXT: movq 32(%rdi), %r10
+; FALLBACK1-NEXT: movq 40(%rdi), %r11
+; FALLBACK1-NEXT: movq 48(%rdi), %rbx
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %esi
+; FALLBACK1-NEXT: xorps %xmm0, %xmm0
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %esi
+; FALLBACK1-NEXT: negl %esi
+; FALLBACK1-NEXT: movslq %esi, %r9
+; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax
+; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq %r10, %rsi
+; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8
+; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi
+; FALLBACK1-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx
+; FALLBACK1-NEXT: movq %rbx, %r14
+; FALLBACK1-NEXT: shldq %cl, %r11, %r14
+; FALLBACK1-NEXT: shldq %cl, %r10, %r11
+; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10
+; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK1-NEXT: shldq %cl, %r10, %r9
+; FALLBACK1-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK1-NEXT: shldq %cl, %r8, %rdi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: shlq %cl, %r8
+; FALLBACK1-NEXT: movq %r10, 48(%rdx)
+; FALLBACK1-NEXT: movq %r9, 56(%rdx)
+; FALLBACK1-NEXT: movq %r11, 32(%rdx)
+; FALLBACK1-NEXT: movq %r14, 40(%rdx)
+; FALLBACK1-NEXT: movq %rax, 16(%rdx)
+; FALLBACK1-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK1-NEXT: movq %r8, (%rdx)
+; FALLBACK1-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK1-NEXT: addq $8, %rsp
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: shl_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rax
+; FALLBACK2-NEXT: movq 8(%rdi), %rcx
+; FALLBACK2-NEXT: movq 16(%rdi), %r8
+; FALLBACK2-NEXT: movq 24(%rdi), %r9
+; FALLBACK2-NEXT: movq 32(%rdi), %r10
+; FALLBACK2-NEXT: movq 40(%rdi), %r11
+; FALLBACK2-NEXT: movq 48(%rdi), %rbx
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %esi
+; FALLBACK2-NEXT: xorps %xmm0, %xmm0
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: andl $56, %esi
+; FALLBACK2-NEXT: negl %esi
+; FALLBACK2-NEXT: movslq %esi, %rsi
+; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10
+; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx
+; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9
+; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
+; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
+; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14
+; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx
+; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8
+; FALLBACK2-NEXT: shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT: shlxq %rax, %r10, %r12
+; FALLBACK2-NEXT: movl %eax, %r13d
+; FALLBACK2-NEXT: notb %r13b
+; FALLBACK2-NEXT: shrq %r10
+; FALLBACK2-NEXT: shrxq %r13, %r10, %r10
+; FALLBACK2-NEXT: orq %r9, %r10
+; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9
+; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT: shrq %r14
+; FALLBACK2-NEXT: shrxq %r13, %r14, %r14
+; FALLBACK2-NEXT: orq %r11, %r14
+; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
+; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi
+; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax
+; FALLBACK2-NEXT: shrq %rcx
+; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx
+; FALLBACK2-NEXT: orq %rbx, %rcx
+; FALLBACK2-NEXT: shrq %r9
+; FALLBACK2-NEXT: shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT: orq %r15, %r9
+; FALLBACK2-NEXT: shrq %rdi
+; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi
+; FALLBACK2-NEXT: orq %rbp, %rdi
+; FALLBACK2-NEXT: shrq %rsi
+; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: shrq %r8
+; FALLBACK2-NEXT: shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT: orq %rax, %r8
+; FALLBACK2-NEXT: movq %r12, (%rdx)
+; FALLBACK2-NEXT: movq %r8, 48(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK2-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK2-NEXT: movq %r9, 40(%rdx)
+; FALLBACK2-NEXT: movq %rcx, 16(%rdx)
+; FALLBACK2-NEXT: movq %r14, 24(%rdx)
+; FALLBACK2-NEXT: movq %r10, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: shl_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: pushq %rax
+; FALLBACK3-NEXT: movq (%rdi), %rax
+; FALLBACK3-NEXT: movq 8(%rdi), %rcx
+; FALLBACK3-NEXT: movq 16(%rdi), %r8
+; FALLBACK3-NEXT: movq 24(%rdi), %r9
+; FALLBACK3-NEXT: movq 32(%rdi), %r10
+; FALLBACK3-NEXT: movq 40(%rdi), %r11
+; FALLBACK3-NEXT: movq 48(%rdi), %rbx
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %esi
+; FALLBACK3-NEXT: xorps %xmm0, %xmm0
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %esi
+; FALLBACK3-NEXT: negl %esi
+; FALLBACK3-NEXT: movslq %esi, %r8
+; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax
+; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq %r9, %rsi
+; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10
+; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi
+; FALLBACK3-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11
+; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx
+; FALLBACK3-NEXT: movq %rbx, %r14
+; FALLBACK3-NEXT: shldq %cl, %r11, %r14
+; FALLBACK3-NEXT: shldq %cl, %r9, %r11
+; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9
+; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK3-NEXT: shldq %cl, %r9, %r8
+; FALLBACK3-NEXT: shldq %cl, %rbx, %r9
+; FALLBACK3-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx
+; FALLBACK3-NEXT: movq %r9, 48(%rdx)
+; FALLBACK3-NEXT: movq %r8, 56(%rdx)
+; FALLBACK3-NEXT: movq %r11, 32(%rdx)
+; FALLBACK3-NEXT: movq %r14, 40(%rdx)
+; FALLBACK3-NEXT: movq %rax, 16(%rdx)
+; FALLBACK3-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK3-NEXT: movq %rcx, (%rdx)
+; FALLBACK3-NEXT: movq %rdi, 8(%rdx)
+; FALLBACK3-NEXT: addq $8, %rsp
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: shl_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK4-NEXT: movl (%rsi), %ecx
+; FALLBACK4-NEXT: xorps %xmm4, %xmm4
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rcx,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %ecx
+; FALLBACK4-NEXT: negl %ecx
+; FALLBACK4-NEXT: movslq %ecx, %r9
+; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK4-NEXT: movq %rdi, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK4-NEXT: movq %r11, %r8
+; FALLBACK4-NEXT: shrq %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r8
+; FALLBACK4-NEXT: orq %r10, %r8
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r10
+; FALLBACK4-NEXT: shrq %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: orq %r11, %r10
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK4-NEXT: movq %r15, %r11
+; FALLBACK4-NEXT: shrq %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %rbx
+; FALLBACK4-NEXT: shrq %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: orq %r15, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: movq %r14, %r15
+; FALLBACK4-NEXT: shrq %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r15
+; FALLBACK4-NEXT: orq %r12, %r15
+; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK4-NEXT: movq %r12, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r13
+; FALLBACK4-NEXT: shrq %rdi
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rdi
+; FALLBACK4-NEXT: orq %r13, %rdi
+; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r9
+; FALLBACK4-NEXT: shrq %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: orq %r9, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: movq %r14, (%rdx)
+; FALLBACK4-NEXT: movq %r12, 56(%rdx)
+; FALLBACK4-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK4-NEXT: movq %r15, 8(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %r10, 32(%rdx)
+; FALLBACK4-NEXT: movq %r8, 40(%rdx)
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: shl_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: xorps %xmm4, %xmm4
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: negl %eax
+; FALLBACK5-NEXT: movslq %eax, %r8
+; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK5-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK5-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK5-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK5-NEXT: movq %r14, %r15
+; FALLBACK5-NEXT: shldq %cl, %r9, %r15
+; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK5-NEXT: shldq %cl, %r14, %r8
+; FALLBACK5-NEXT: movq %r11, %r9
+; FALLBACK5-NEXT: shlq %cl, %r9
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK5-NEXT: movq %r8, 56(%rdx)
+; FALLBACK5-NEXT: movq %r15, 48(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK5-NEXT: movq %r10, 16(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK5-NEXT: movq %rax, 32(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK5-NEXT: movq %r9, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: shl_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: subq $24, %rsp
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: xorps %xmm4, %xmm4
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm3, (%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %ecx
+; FALLBACK6-NEXT: andl $56, %ecx
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: negl %eax
+; FALLBACK6-NEXT: movslq %eax, %rsi
+; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK6-NEXT: movl %ecx, %r9d
+; FALLBACK6-NEXT: notb %r9b
+; FALLBACK6-NEXT: shrq %rdi
+; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r12, %rdi
+; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK6-NEXT: shrq %r13
+; FALLBACK6-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK6-NEXT: orq %r15, %r12
+; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK6-NEXT: shrq %r11
+; FALLBACK6-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK6-NEXT: shrq %r14
+; FALLBACK6-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK6-NEXT: orq %r10, %r14
+; FALLBACK6-NEXT: shrq %rsi
+; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK6-NEXT: orq %rbx, %rsi
+; FALLBACK6-NEXT: shrq %rax
+; FALLBACK6-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK6-NEXT: orq %r8, %rax
+; FALLBACK6-NEXT: shrq %rbp
+; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK6-NEXT: orq %r15, %r8
+; FALLBACK6-NEXT: movq %rcx, (%rdx)
+; FALLBACK6-NEXT: movq %r8, 56(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK6-NEXT: movq %r14, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r12, 32(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK6-NEXT: addq $24, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: shl_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: xorps %xmm4, %xmm4
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: negl %eax
+; FALLBACK7-NEXT: movslq %eax, %r8
+; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK7-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK7-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK7-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK7-NEXT: movq %r14, %r15
+; FALLBACK7-NEXT: shldq %cl, %r9, %r15
+; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK7-NEXT: shldq %cl, %r14, %r8
+; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK7-NEXT: movq %r8, 56(%rdx)
+; FALLBACK7-NEXT: movq %r15, 48(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK7-NEXT: movq %r10, 16(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK7-NEXT: movq %rax, 32(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK7-NEXT: movq %r9, (%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: shl_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK8-NEXT: movl (%rsi), %ecx
+; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rcx,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %ecx
+; FALLBACK8-NEXT: negl %ecx
+; FALLBACK8-NEXT: movslq %ecx, %r9
+; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK8-NEXT: movq %rdi, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK8-NEXT: movq %r11, %r8
+; FALLBACK8-NEXT: shrq %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r8
+; FALLBACK8-NEXT: orq %r10, %r8
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r10
+; FALLBACK8-NEXT: shrq %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: orq %r11, %r10
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK8-NEXT: movq %r15, %r11
+; FALLBACK8-NEXT: shrq %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %rbx
+; FALLBACK8-NEXT: shrq %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: orq %r15, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: movq %r14, %r15
+; FALLBACK8-NEXT: shrq %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r15
+; FALLBACK8-NEXT: orq %r12, %r15
+; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK8-NEXT: movq %r12, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r13
+; FALLBACK8-NEXT: shrq %rdi
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rdi
+; FALLBACK8-NEXT: orq %r13, %rdi
+; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r9
+; FALLBACK8-NEXT: shrq %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: orq %r9, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: movq %r14, (%rdx)
+; FALLBACK8-NEXT: movq %r12, 56(%rdx)
+; FALLBACK8-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK8-NEXT: movq %r15, 8(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %r10, 32(%rdx)
+; FALLBACK8-NEXT: movq %r8, 40(%rdx)
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: shl_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: negl %eax
+; FALLBACK9-NEXT: movslq %eax, %r8
+; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK9-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK9-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK9-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK9-NEXT: movq %r14, %r15
+; FALLBACK9-NEXT: shldq %cl, %r9, %r15
+; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK9-NEXT: shldq %cl, %r14, %r8
+; FALLBACK9-NEXT: movq %r11, %r9
+; FALLBACK9-NEXT: shlq %cl, %r9
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK9-NEXT: movq %r8, 56(%rdx)
+; FALLBACK9-NEXT: movq %r15, 48(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK9-NEXT: movq %r10, 16(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK9-NEXT: movq %rax, 32(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK9-NEXT: movq %r9, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: shl_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: subq $24, %rsp
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %ecx
+; FALLBACK10-NEXT: andl $56, %ecx
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: negl %eax
+; FALLBACK10-NEXT: movslq %eax, %rsi
+; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK10-NEXT: movl %ecx, %r9d
+; FALLBACK10-NEXT: notb %r9b
+; FALLBACK10-NEXT: shrq %rdi
+; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r12, %rdi
+; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK10-NEXT: shrq %r13
+; FALLBACK10-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK10-NEXT: orq %r15, %r12
+; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK10-NEXT: shrq %r11
+; FALLBACK10-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK10-NEXT: shrq %r14
+; FALLBACK10-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK10-NEXT: orq %r10, %r14
+; FALLBACK10-NEXT: shrq %rsi
+; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK10-NEXT: orq %rbx, %rsi
+; FALLBACK10-NEXT: shrq %rax
+; FALLBACK10-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK10-NEXT: orq %r8, %rax
+; FALLBACK10-NEXT: shrq %rbp
+; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK10-NEXT: orq %r15, %r8
+; FALLBACK10-NEXT: movq %rcx, (%rdx)
+; FALLBACK10-NEXT: movq %r8, 56(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK10-NEXT: movq %r14, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r12, 32(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK10-NEXT: addq $24, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: shl_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: negl %eax
+; FALLBACK11-NEXT: movslq %eax, %r8
+; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK11-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK11-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK11-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK11-NEXT: movq %r14, %r15
+; FALLBACK11-NEXT: shldq %cl, %r9, %r15
+; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK11-NEXT: shldq %cl, %r14, %r8
+; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK11-NEXT: movq %r8, 56(%rdx)
+; FALLBACK11-NEXT: movq %r15, 48(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK11-NEXT: movq %r10, 16(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK11-NEXT: movq %rax, 32(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK11-NEXT: movq %r9, (%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: shl_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK12-NEXT: movl (%rsi), %ecx
+; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rcx,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %ecx
+; FALLBACK12-NEXT: negl %ecx
+; FALLBACK12-NEXT: movslq %ecx, %r9
+; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi
+; FALLBACK12-NEXT: movq %rdi, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11
+; FALLBACK12-NEXT: movq %r11, %r8
+; FALLBACK12-NEXT: shrq %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r8
+; FALLBACK12-NEXT: orq %r10, %r8
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r10
+; FALLBACK12-NEXT: shrq %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: orq %r11, %r10
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15
+; FALLBACK12-NEXT: movq %r15, %r11
+; FALLBACK12-NEXT: shrq %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14
+; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %rbx
+; FALLBACK12-NEXT: shrq %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: orq %r15, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: movq %r14, %r15
+; FALLBACK12-NEXT: shrq %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r15
+; FALLBACK12-NEXT: orq %r12, %r15
+; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12
+; FALLBACK12-NEXT: movq %r12, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r13
+; FALLBACK12-NEXT: shrq %rdi
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rdi
+; FALLBACK12-NEXT: orq %r13, %rdi
+; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r9
+; FALLBACK12-NEXT: shrq %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: orq %r9, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: movq %r14, (%rdx)
+; FALLBACK12-NEXT: movq %r12, 56(%rdx)
+; FALLBACK12-NEXT: movq %rdi, 48(%rdx)
+; FALLBACK12-NEXT: movq %r15, 8(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %r10, 32(%rdx)
+; FALLBACK12-NEXT: movq %r8, 40(%rdx)
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: shl_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: negl %eax
+; FALLBACK13-NEXT: movslq %eax, %r8
+; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK13-NEXT: movq %r9, %rsi
+; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK13-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK13-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK13-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK13-NEXT: movq %r14, %r15
+; FALLBACK13-NEXT: shldq %cl, %r9, %r15
+; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK13-NEXT: shldq %cl, %r14, %r8
+; FALLBACK13-NEXT: movq %r11, %r9
+; FALLBACK13-NEXT: shlq %cl, %r9
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK13-NEXT: movq %r8, 56(%rdx)
+; FALLBACK13-NEXT: movq %r15, 48(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK13-NEXT: movq %r10, 16(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK13-NEXT: movq %rax, 32(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK13-NEXT: movq %r9, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: shl_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: subq $24, %rsp
+; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK14-NEXT: movl (%rsi), %eax
+; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rax,8), %ecx
+; FALLBACK14-NEXT: andl $56, %ecx
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: negl %eax
+; FALLBACK14-NEXT: movslq %eax, %rsi
+; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax
+; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12
+; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi
+; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15
+; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13
+; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8
+; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11
+; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10
+; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14
+; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx
+; FALLBACK14-NEXT: movl %ecx, %r9d
+; FALLBACK14-NEXT: notb %r9b
+; FALLBACK14-NEXT: shrq %rdi
+; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r12, %rdi
+; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp
+; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8
+; FALLBACK14-NEXT: shrq %r13
+; FALLBACK14-NEXT: shrxq %r9, %r13, %r12
+; FALLBACK14-NEXT: orq %r15, %r12
+; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi
+; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx
+; FALLBACK14-NEXT: shrq %r11
+; FALLBACK14-NEXT: shrxq %r9, %r11, %r11
+; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; FALLBACK14-NEXT: shrq %r14
+; FALLBACK14-NEXT: shrxq %r9, %r14, %r14
+; FALLBACK14-NEXT: orq %r10, %r14
+; FALLBACK14-NEXT: shrq %rsi
+; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi
+; FALLBACK14-NEXT: orq %rbx, %rsi
+; FALLBACK14-NEXT: shrq %rax
+; FALLBACK14-NEXT: shrxq %r9, %rax, %rax
+; FALLBACK14-NEXT: orq %r8, %rax
+; FALLBACK14-NEXT: shrq %rbp
+; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8
+; FALLBACK14-NEXT: orq %r15, %r8
+; FALLBACK14-NEXT: movq %rcx, (%rdx)
+; FALLBACK14-NEXT: movq %r8, 56(%rdx)
+; FALLBACK14-NEXT: movq %rax, 48(%rdx)
+; FALLBACK14-NEXT: movq %rsi, 8(%rdx)
+; FALLBACK14-NEXT: movq %r14, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r12, 32(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 40(%rdx)
+; FALLBACK14-NEXT: addq $24, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: shl_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: negl %eax
+; FALLBACK15-NEXT: movslq %eax, %r8
+; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax
+; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
+; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi
+; FALLBACK15-NEXT: shldq %cl, %rdi, %rax
+; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10
+; FALLBACK15-NEXT: shldq %cl, %r10, %rdi
+; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11
+; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx
+; FALLBACK15-NEXT: shldq %cl, %rbx, %r10
+; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14
+; FALLBACK15-NEXT: movq %r14, %r15
+; FALLBACK15-NEXT: shldq %cl, %r9, %r15
+; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8
+; FALLBACK15-NEXT: shldq %cl, %r14, %r8
+; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shldq %cl, %r11, %rbx
+; FALLBACK15-NEXT: movq %r8, 56(%rdx)
+; FALLBACK15-NEXT: movq %r15, 48(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK15-NEXT: movq %r10, 16(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
+; FALLBACK15-NEXT: movq %rax, 32(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 40(%rdx)
+; FALLBACK15-NEXT: movq %r9, (%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: shl_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%eax), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%eax), %ebp
+; FALLBACK16-NEXT: movl 44(%eax), %ebx
+; FALLBACK16-NEXT: movl 48(%eax), %edi
+; FALLBACK16-NEXT: movl 52(%eax), %esi
+; FALLBACK16-NEXT: movl 56(%eax), %edx
+; FALLBACK16-NEXT: movl 60(%eax), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl (%eax), %eax
+; FALLBACK16-NEXT: xorps %xmm0, %xmm0
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %eax, %edx
+; FALLBACK16-NEXT: andl $60, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: subl %edx, %ecx
+; FALLBACK16-NEXT: movl (%ecx), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %edx
+; FALLBACK16-NEXT: movl %ecx, %ebp
+; FALLBACK16-NEXT: shll $3, %eax
+; FALLBACK16-NEXT: andl $24, %eax
+; FALLBACK16-NEXT: movl %edx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %al, %ch
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 8(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, %edi
+; FALLBACK16-NEXT: movl %esi, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %ebx, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edi, %ebp
+; FALLBACK16-NEXT: movl 20(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 16(%edi), %esi
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl 28(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 24(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 32(%edx), %esi
+; FALLBACK16-NEXT: movl %edx, %ebp
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %ebx, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 44(%ebp), %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl 40(%ebp), %esi
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %esi, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 52(%ebp), %esi
+; FALLBACK16-NEXT: movl %esi, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: negl %edx
+; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK16-NEXT: movl %ebx, %ebp
+; FALLBACK16-NEXT: shrl %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %ebp
+; FALLBACK16-NEXT: orl %edi, %ebp
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shrl %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: orl %ebx, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT: movl 60(%edi), %edx
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl 56(%edi), %ebx
+; FALLBACK16-NEXT: movl %ebx, %edi
+; FALLBACK16-NEXT: shrl %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: orl %edx, %edi
+; FALLBACK16-NEXT: movb %al, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: shrl %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: orl %ebx, %esi
+; FALLBACK16-NEXT: movl %eax, %ecx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %edx, (%eax)
+; FALLBACK16-NEXT: movl %esi, 56(%eax)
+; FALLBACK16-NEXT: movl %edi, 60(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: shl_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%ecx), %eax
+; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%ecx), %ebp
+; FALLBACK17-NEXT: movl 44(%ecx), %ebx
+; FALLBACK17-NEXT: movl 48(%ecx), %edi
+; FALLBACK17-NEXT: movl 52(%ecx), %esi
+; FALLBACK17-NEXT: movl 56(%ecx), %edx
+; FALLBACK17-NEXT: movl 60(%ecx), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: xorps %xmm0, %xmm0
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: subl %ebp, %eax
+; FALLBACK17-NEXT: movl 8(%eax), %esi
+; FALLBACK17-NEXT: movl 12(%eax), %edx
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: movl %edx, %edi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %edi
+; FALLBACK17-NEXT: movl 20(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %edi
+; FALLBACK17-NEXT: movl 28(%eax), %edx
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %edi
+; FALLBACK17-NEXT: movl 36(%eax), %esi
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %edx
+; FALLBACK17-NEXT: movl 44(%eax), %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %edi
+; FALLBACK17-NEXT: shldl %cl, %edx, %edi
+; FALLBACK17-NEXT: movl (%eax), %ebx
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: shldl %cl, %esi, %edx
+; FALLBACK17-NEXT: negl %ebp
+; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl %edi, 60(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
+; FALLBACK17-NEXT: shldl %cl, %eax, %esi
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK17-NEXT: shldl %cl, %edi, %eax
+; FALLBACK17-NEXT: movl %eax, 48(%ebp)
+; FALLBACK17-NEXT: movl %esi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl %edx, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: shl_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebx
+; FALLBACK18-NEXT: movl 44(%eax), %edi
+; FALLBACK18-NEXT: movl 48(%eax), %esi
+; FALLBACK18-NEXT: movl 52(%eax), %edx
+; FALLBACK18-NEXT: movl 56(%eax), %ecx
+; FALLBACK18-NEXT: movl 60(%eax), %eax
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK18-NEXT: movl (%ebp), %ebp
+; FALLBACK18-NEXT: xorps %xmm0, %xmm0
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: leal (,%ebp,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ebp
+; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK18-NEXT: subl %ebp, %edi
+; FALLBACK18-NEXT: movl (%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, %ecx
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 12(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 20(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 28(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%edi), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: movl 36(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%edi), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT: movl 44(%edi), %ecx
+; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %eax, %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 48(%edi), %esi
+; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 52(%edi), %esi
+; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrl %ecx
+; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK18-NEXT: orl %eax, %ebp
+; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT: negl %eax
+; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK18-NEXT: movl 56(%edi), %eax
+; FALLBACK18-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK18-NEXT: shrl %esi
+; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %edx, %esi
+; FALLBACK18-NEXT: shrl %eax
+; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT: orl %eax, %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, (%eax)
+; FALLBACK18-NEXT: movl %esi, 56(%eax)
+; FALLBACK18-NEXT: movl %ecx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebp, 48(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: shl_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $204, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%ebp), %ebx
+; FALLBACK19-NEXT: movl 44(%ebp), %edi
+; FALLBACK19-NEXT: movl 48(%ebp), %esi
+; FALLBACK19-NEXT: movl 52(%ebp), %edx
+; FALLBACK19-NEXT: movl 56(%ebp), %ecx
+; FALLBACK19-NEXT: movl 60(%ebp), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl (%ebp), %ebp
+; FALLBACK19-NEXT: xorps %xmm0, %xmm0
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: subl %ebp, %eax
+; FALLBACK19-NEXT: movl 4(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %edi
+; FALLBACK19-NEXT: movl 12(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %edi
+; FALLBACK19-NEXT: movl 20(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %edi
+; FALLBACK19-NEXT: movl 28(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %edi
+; FALLBACK19-NEXT: movl 36(%eax), %esi
+; FALLBACK19-NEXT: movl %esi, %ebx
+; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebx
+; FALLBACK19-NEXT: movl 44(%eax), %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %edi
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl (%eax), %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: shldl %cl, %esi, %edx
+; FALLBACK19-NEXT: negl %ebp
+; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl %edx, 56(%eax)
+; FALLBACK19-NEXT: movl %edi, 60(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %edi
+; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK19-NEXT: movl %ebp, 48(%eax)
+; FALLBACK19-NEXT: movl %esi, 52(%eax)
+; FALLBACK19-NEXT: movl %ebx, 40(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 44(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 32(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 36(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 24(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 28(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 16(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 20(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 8(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 12(%eax)
+; FALLBACK19-NEXT: movl %edi, 4(%eax)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, (%eax)
+; FALLBACK19-NEXT: addl $204, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: shl_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: xorps %xmm4, %xmm4
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: andl $60, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: subl %edx, %ecx
+; FALLBACK20-NEXT: movl (%ecx), %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 4(%ecx), %edx
+; FALLBACK20-NEXT: movl %ecx, %ebp
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 12(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 8(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, %edi
+; FALLBACK20-NEXT: movl %esi, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl 20(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 16(%edi), %esi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %esi, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %ebp, %edx
+; FALLBACK20-NEXT: movl 28(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 24(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %esi, %ebp
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 36(%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 32(%edx), %esi
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %ebx, %edi
+; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 44(%ebp), %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl 40(%ebp), %esi
+; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %esi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 52(%ebp), %esi
+; FALLBACK20-NEXT: movl %esi, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: negl %edx
+; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: shrl %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: orl %edi, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT: movl 60(%edi), %edx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl 56(%edi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %edi
+; FALLBACK20-NEXT: shrl %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: shrl %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shrl %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %edx, (%eax)
+; FALLBACK20-NEXT: movl %esi, 56(%eax)
+; FALLBACK20-NEXT: movl %edi, 60(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: shl_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movups (%ecx), %xmm0
+; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK21-NEXT: movl (%eax), %ecx
+; FALLBACK21-NEXT: xorps %xmm4, %xmm4
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: subl %ebp, %eax
+; FALLBACK21-NEXT: movl 8(%eax), %esi
+; FALLBACK21-NEXT: movl 12(%eax), %edx
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: movl %edx, %edi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 4(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 16(%eax), %edi
+; FALLBACK21-NEXT: movl 20(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 24(%eax), %edi
+; FALLBACK21-NEXT: movl 28(%eax), %edx
+; FALLBACK21-NEXT: movl %edx, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 32(%eax), %edi
+; FALLBACK21-NEXT: movl 36(%eax), %esi
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 40(%eax), %edx
+; FALLBACK21-NEXT: movl 44(%eax), %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 56(%eax), %edx
+; FALLBACK21-NEXT: movl 60(%eax), %edi
+; FALLBACK21-NEXT: shldl %cl, %edx, %edi
+; FALLBACK21-NEXT: movl (%eax), %ebx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: shldl %cl, %esi, %edx
+; FALLBACK21-NEXT: negl %ebp
+; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl %edi, 60(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT: shll %cl, %ebx
+; FALLBACK21-NEXT: shldl %cl, %eax, %esi
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK21-NEXT: shldl %cl, %edi, %eax
+; FALLBACK21-NEXT: movl %eax, 48(%ebp)
+; FALLBACK21-NEXT: movl %esi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl %edx, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: shl_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: xorps %xmm4, %xmm4
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK22-NEXT: subl %eax, %edi
+; FALLBACK22-NEXT: movl (%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 4(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 8(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, %ecx
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 12(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 16(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 20(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 24(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 28(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 32(%edi), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: movl 36(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 40(%edi), %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT: movl 44(%edi), %ecx
+; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %eax, %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 48(%edi), %esi
+; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 52(%edi), %esi
+; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrl %ecx
+; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK22-NEXT: orl %eax, %ebp
+; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT: negl %eax
+; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK22-NEXT: movl 56(%edi), %eax
+; FALLBACK22-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK22-NEXT: shrl %esi
+; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %edx, %esi
+; FALLBACK22-NEXT: shrl %eax
+; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT: orl %eax, %ecx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK22-NEXT: movl %edx, (%eax)
+; FALLBACK22-NEXT: movl %esi, 56(%eax)
+; FALLBACK22-NEXT: movl %ecx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebp, 48(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: shl_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $204, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movups (%ecx), %xmm0
+; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
+; FALLBACK23-NEXT: movl (%eax), %ebp
+; FALLBACK23-NEXT: xorps %xmm4, %xmm4
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: leal (,%ebp,8), %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: subl %ebp, %eax
+; FALLBACK23-NEXT: movl 4(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 8(%eax), %edi
+; FALLBACK23-NEXT: movl 12(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 16(%eax), %edi
+; FALLBACK23-NEXT: movl 20(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 24(%eax), %edi
+; FALLBACK23-NEXT: movl 28(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 32(%eax), %edi
+; FALLBACK23-NEXT: movl 36(%eax), %esi
+; FALLBACK23-NEXT: movl %esi, %ebx
+; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 40(%eax), %ebx
+; FALLBACK23-NEXT: movl 44(%eax), %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT: movl 56(%eax), %edx
+; FALLBACK23-NEXT: movl 60(%eax), %edi
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl (%eax), %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: shldl %cl, %esi, %edx
+; FALLBACK23-NEXT: negl %ebp
+; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movl %edx, 56(%eax)
+; FALLBACK23-NEXT: movl %edi, 60(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %edi
+; FALLBACK23-NEXT: shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
+; FALLBACK23-NEXT: movl %ebp, 48(%eax)
+; FALLBACK23-NEXT: movl %esi, 52(%eax)
+; FALLBACK23-NEXT: movl %ebx, 40(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 44(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 32(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 36(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 24(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 28(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 16(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 20(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 8(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 12(%eax)
+; FALLBACK23-NEXT: movl %edi, 4(%eax)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, (%eax)
+; FALLBACK23-NEXT: addl $204, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: shl_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: andl $60, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: subl %edx, %ecx
+; FALLBACK24-NEXT: movl (%ecx), %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 4(%ecx), %edx
+; FALLBACK24-NEXT: movl %ecx, %ebp
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 12(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 8(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, %edi
+; FALLBACK24-NEXT: movl %esi, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl 20(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 16(%edi), %esi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %esi, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %ebp, %edx
+; FALLBACK24-NEXT: movl 28(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 24(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %esi, %ebp
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 36(%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 32(%edx), %esi
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %ebx, %edi
+; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 44(%ebp), %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl 40(%ebp), %esi
+; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %esi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 52(%ebp), %esi
+; FALLBACK24-NEXT: movl %esi, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: negl %edx
+; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: shrl %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: orl %edi, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT: movl 60(%edi), %edx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl 56(%edi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %edi
+; FALLBACK24-NEXT: shrl %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: shrl %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shrl %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %edx, (%eax)
+; FALLBACK24-NEXT: movl %esi, 56(%eax)
+; FALLBACK24-NEXT: movl %edi, 60(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: shl_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK25-NEXT: movl (%eax), %ecx
+; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: subl %ebp, %eax
+; FALLBACK25-NEXT: movl 8(%eax), %esi
+; FALLBACK25-NEXT: movl 12(%eax), %edx
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: movl %edx, %edi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 4(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 16(%eax), %edi
+; FALLBACK25-NEXT: movl 20(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 24(%eax), %edi
+; FALLBACK25-NEXT: movl 28(%eax), %edx
+; FALLBACK25-NEXT: movl %edx, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 32(%eax), %edi
+; FALLBACK25-NEXT: movl 36(%eax), %esi
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 40(%eax), %edx
+; FALLBACK25-NEXT: movl 44(%eax), %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 56(%eax), %edx
+; FALLBACK25-NEXT: movl 60(%eax), %edi
+; FALLBACK25-NEXT: shldl %cl, %edx, %edi
+; FALLBACK25-NEXT: movl (%eax), %ebx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: shldl %cl, %esi, %edx
+; FALLBACK25-NEXT: negl %ebp
+; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl %edi, 60(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT: shll %cl, %ebx
+; FALLBACK25-NEXT: shldl %cl, %eax, %esi
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK25-NEXT: shldl %cl, %edi, %eax
+; FALLBACK25-NEXT: movl %eax, 48(%ebp)
+; FALLBACK25-NEXT: movl %esi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl %edx, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: shl_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK26-NEXT: subl %eax, %edi
+; FALLBACK26-NEXT: movl (%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 4(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 8(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, %ecx
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 12(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 16(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 20(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 24(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 28(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 32(%edi), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: movl 36(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 40(%edi), %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT: movl 44(%edi), %ecx
+; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %eax, %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 48(%edi), %esi
+; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 52(%edi), %esi
+; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrl %ecx
+; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK26-NEXT: orl %eax, %ebp
+; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT: negl %eax
+; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK26-NEXT: movl 56(%edi), %eax
+; FALLBACK26-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK26-NEXT: shrl %esi
+; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %edx, %esi
+; FALLBACK26-NEXT: shrl %eax
+; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT: orl %eax, %ecx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK26-NEXT: movl %edx, (%eax)
+; FALLBACK26-NEXT: movl %esi, 56(%eax)
+; FALLBACK26-NEXT: movl %ecx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebp, 48(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: shl_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $204, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
+; FALLBACK27-NEXT: movl (%eax), %ebx
+; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: andl $60, %ebx
+; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: subl %ebx, %eax
+; FALLBACK27-NEXT: movl 4(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 8(%eax), %edi
+; FALLBACK27-NEXT: movl 12(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 16(%eax), %edi
+; FALLBACK27-NEXT: movl 20(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 24(%eax), %edi
+; FALLBACK27-NEXT: movl 28(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 32(%eax), %edi
+; FALLBACK27-NEXT: movl 36(%eax), %esi
+; FALLBACK27-NEXT: movl %esi, %ebp
+; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 40(%eax), %ebp
+; FALLBACK27-NEXT: movl 44(%eax), %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK27-NEXT: movl 56(%eax), %edx
+; FALLBACK27-NEXT: movl 60(%eax), %edi
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl (%eax), %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: shldl %cl, %esi, %edx
+; FALLBACK27-NEXT: negl %ebx
+; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: movl %edx, 56(%eax)
+; FALLBACK27-NEXT: movl %edi, 60(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %edi
+; FALLBACK27-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK27-NEXT: movl %ebx, 48(%eax)
+; FALLBACK27-NEXT: movl %esi, 52(%eax)
+; FALLBACK27-NEXT: movl %ebp, 40(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 44(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 32(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 36(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 24(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 28(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 16(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 20(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 8(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 12(%eax)
+; FALLBACK27-NEXT: movl %edi, 4(%eax)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, (%eax)
+; FALLBACK27-NEXT: addl $204, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: shl_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: andl $60, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: subl %edx, %ecx
+; FALLBACK28-NEXT: movl (%ecx), %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 4(%ecx), %edx
+; FALLBACK28-NEXT: movl %ecx, %ebp
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 12(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 8(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, %edi
+; FALLBACK28-NEXT: movl %esi, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl 20(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 16(%edi), %esi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %esi, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %ebp, %edx
+; FALLBACK28-NEXT: movl 28(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 24(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %esi, %ebp
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 36(%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 32(%edx), %esi
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %ebx, %edi
+; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 44(%ebp), %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl 40(%ebp), %esi
+; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %esi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 52(%ebp), %esi
+; FALLBACK28-NEXT: movl %esi, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: negl %edx
+; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: shrl %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: orl %edi, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT: movl 60(%edi), %edx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl 56(%edi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %edi
+; FALLBACK28-NEXT: shrl %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: shrl %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shrl %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %edx, (%eax)
+; FALLBACK28-NEXT: movl %esi, 56(%eax)
+; FALLBACK28-NEXT: movl %edi, 60(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: shl_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK29-NEXT: movl (%eax), %ecx
+; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: subl %ebp, %eax
+; FALLBACK29-NEXT: movl 8(%eax), %esi
+; FALLBACK29-NEXT: movl 12(%eax), %edx
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: movl %edx, %edi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 4(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 16(%eax), %edi
+; FALLBACK29-NEXT: movl 20(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 24(%eax), %edi
+; FALLBACK29-NEXT: movl 28(%eax), %edx
+; FALLBACK29-NEXT: movl %edx, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 32(%eax), %edi
+; FALLBACK29-NEXT: movl 36(%eax), %esi
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 40(%eax), %edx
+; FALLBACK29-NEXT: movl 44(%eax), %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 56(%eax), %edx
+; FALLBACK29-NEXT: movl 60(%eax), %edi
+; FALLBACK29-NEXT: shldl %cl, %edx, %edi
+; FALLBACK29-NEXT: movl (%eax), %ebx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: shldl %cl, %esi, %edx
+; FALLBACK29-NEXT: negl %ebp
+; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl %edi, 60(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT: shll %cl, %ebx
+; FALLBACK29-NEXT: shldl %cl, %eax, %esi
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK29-NEXT: shldl %cl, %edi, %eax
+; FALLBACK29-NEXT: movl %eax, 48(%ebp)
+; FALLBACK29-NEXT: movl %esi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl %edx, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: shl_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi
+; FALLBACK30-NEXT: subl %eax, %edi
+; FALLBACK30-NEXT: movl (%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 4(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi
+; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 8(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, %ecx
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 12(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 16(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 20(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 24(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 28(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 32(%edi), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: movl 36(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 40(%edi), %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT: movl 44(%edi), %ecx
+; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %eax, %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 48(%edi), %esi
+; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 52(%edi), %esi
+; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrl %ecx
+; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp
+; FALLBACK30-NEXT: orl %eax, %ebp
+; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT: negl %eax
+; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
+; FALLBACK30-NEXT: movl 56(%edi), %eax
+; FALLBACK30-NEXT: shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT: shrl %esi
+; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %edx, %esi
+; FALLBACK30-NEXT: shrl %eax
+; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT: orl %eax, %ecx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK30-NEXT: movl %edx, (%eax)
+; FALLBACK30-NEXT: movl %esi, 56(%eax)
+; FALLBACK30-NEXT: movl %ecx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebp, 48(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: shl_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $204, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
+; FALLBACK31-NEXT: movl (%eax), %ebx
+; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: leal (,%ebx,8), %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: andl $60, %ebx
+; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: subl %ebx, %eax
+; FALLBACK31-NEXT: movl 4(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 8(%eax), %edi
+; FALLBACK31-NEXT: movl 12(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 16(%eax), %edi
+; FALLBACK31-NEXT: movl 20(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 24(%eax), %edi
+; FALLBACK31-NEXT: movl 28(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 32(%eax), %edi
+; FALLBACK31-NEXT: movl 36(%eax), %esi
+; FALLBACK31-NEXT: movl %esi, %ebp
+; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
+; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 40(%eax), %ebp
+; FALLBACK31-NEXT: movl 44(%eax), %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %ebp, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
+; FALLBACK31-NEXT: movl 56(%eax), %edx
+; FALLBACK31-NEXT: movl 60(%eax), %edi
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl (%eax), %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: shldl %cl, %esi, %edx
+; FALLBACK31-NEXT: negl %ebx
+; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: movl %edx, 56(%eax)
+; FALLBACK31-NEXT: movl %edi, 60(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %edi
+; FALLBACK31-NEXT: shldl %cl, %ebx, %esi
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
+; FALLBACK31-NEXT: movl %ebx, 48(%eax)
+; FALLBACK31-NEXT: movl %esi, 52(%eax)
+; FALLBACK31-NEXT: movl %ebp, 40(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 44(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 32(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 36(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 24(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 28(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 16(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 20(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 8(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 12(%eax)
+; FALLBACK31-NEXT: movl %edi, 4(%eax)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, (%eax)
+; FALLBACK31-NEXT: addl $204, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = shl i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: shl_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -2012,6 +19811,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq 48(%rdi), %rbx
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
+; X64-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -2020,15 +19824,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
+; X64-SSE2-NEXT: shll $3, %esi
+; X64-SSE2-NEXT: andl $56, %esi
; X64-SSE2-NEXT: negl %esi
; X64-SSE2-NEXT: movslq %esi, %rax
; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx
@@ -2050,23 +19847,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: shl_64bytes:
+; X64-SSE42-LABEL: shl_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %eax
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: shll $3, %eax
+; X64-SSE42-NEXT: andl $56, %eax
; X64-SSE42-NEXT: negl %eax
; X64-SSE42-NEXT: cltq
; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
@@ -2077,10 +19876,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX1-LABEL: shl_64bytes:
+; X64-AVX1-LABEL: shl_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
@@ -2089,7 +19890,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: andl $63, %eax
+; X64-AVX1-NEXT: shll $3, %eax
+; X64-AVX1-NEXT: andl $56, %eax
; X64-AVX1-NEXT: negl %eax
; X64-AVX1-NEXT: cltq
; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0
@@ -2100,17 +19902,20 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: shl_64bytes:
+; X64-AVX512-LABEL: shl_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: pushq %rax
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $63, %eax
+; X64-AVX512-NEXT: shll $3, %eax
+; X64-AVX512-NEXT: andl $56, %eax
; X64-AVX512-NEXT: negl %eax
; X64-AVX512-NEXT: cltq
; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
@@ -2121,117 +19926,108 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX512-NEXT: popq %rax
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
-; X86-SSE2-LABEL: shl_64bytes:
+; X86-SSE2-LABEL: shl_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 40(%eax), %ebp
-; X86-SSE2-NEXT: movl 44(%eax), %ebx
-; X86-SSE2-NEXT: movl 48(%eax), %edi
-; X86-SSE2-NEXT: movl 52(%eax), %esi
-; X86-SSE2-NEXT: movl 56(%eax), %edx
-; X86-SSE2-NEXT: movl 60(%eax), %ecx
-; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movl (%eax), %eax
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: subl $188, %esp
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl (%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movl (%ecx), %ecx
+; X86-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT: subl %eax, %ecx
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%ecx), %edx
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-SSE2-NEXT: shll $3, %ecx
+; X86-SSE2-NEXT: andl $56, %ecx
+; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: subl %ecx, %eax
+; X86-SSE2-NEXT: movl (%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-SSE2-NEXT: movl 4(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 12(%ecx), %edx
+; X86-SSE2-NEXT: movl 12(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 8(%ecx), %edx
+; X86-SSE2-NEXT: movl 8(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-SSE2-NEXT: movl 20(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 16(%ecx), %edx
+; X86-SSE2-NEXT: movl 16(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 28(%ecx), %edx
+; X86-SSE2-NEXT: movl 28(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-SSE2-NEXT: movl 24(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 36(%ecx), %edx
+; X86-SSE2-NEXT: movl 36(%eax), %edx
; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 32(%ecx), %edx
-; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%ecx), %ebp
-; X86-SSE2-NEXT: movl 40(%ecx), %ebx
-; X86-SSE2-NEXT: movl 52(%ecx), %edi
-; X86-SSE2-NEXT: movl 60(%ecx), %esi
-; X86-SSE2-NEXT: movl 56(%ecx), %edx
-; X86-SSE2-NEXT: negl %eax
-; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 32(%eax), %edx
+; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 44(%eax), %ebp
+; X86-SSE2-NEXT: movl 40(%eax), %ebx
+; X86-SSE2-NEXT: movl 52(%eax), %edi
+; X86-SSE2-NEXT: movl 60(%eax), %esi
+; X86-SSE2-NEXT: movl 56(%eax), %edx
+; X86-SSE2-NEXT: negl %ecx
+; X86-SSE2-NEXT: movl 160(%esp,%ecx), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %edx, 56(%eax)
; X86-SSE2-NEXT: movl %esi, 60(%eax)
@@ -2239,7 +20035,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -2259,16 +20055,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: shl_64bytes:
+; X86-SSE42-LABEL: shl_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
-; X86-SSE42-NEXT: subl $128, %esp
+; X86-SSE42-NEXT: subl $140, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2278,15 +20074,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups 48(%edx), %xmm3
; X86-SSE42-NEXT: movl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm4, %xmm4
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm4, (%esp)
-; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm4, (%esp)
+; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: shll $3, %ecx
+; X86-SSE42-NEXT: andl $56, %ecx
; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: subl %ecx, %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
@@ -2298,12 +20095,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
-; X86-SSE42-NEXT: addl $128, %esp
+; X86-SSE42-NEXT: addl $140, %esp
; X86-SSE42-NEXT: retl
;
-; X86-AVX1-LABEL: shl_64bytes:
+; X86-AVX1-LABEL: shl_64bytes_qwordOff:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: subl $140, %esp
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2315,7 +20112,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %ymm2, (%esp)
; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX1-NEXT: andl $63, %ecx
+; X86-AVX1-NEXT: shll $3, %ecx
+; X86-AVX1-NEXT: andl $56, %ecx
; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT: subl %ecx, %edx
; X86-AVX1-NEXT: vmovups (%edx), %xmm0
@@ -2327,13 +20125,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: addl $140, %esp
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: shl_64bytes:
+; X86-AVX512-LABEL: shl_64bytes_qwordOff:
; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $128, %esp
+; X86-AVX512-NEXT: subl $140, %esp
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2342,7 +20140,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: andl $63, %ecx
+; X86-AVX512-NEXT: shll $3, %ecx
+; X86-AVX512-NEXT: andl $56, %ecx
; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
; X86-AVX512-NEXT: subl %ecx, %edx
; X86-AVX512-NEXT: vmovups (%edx), %xmm0
@@ -2354,18 +20153,4121 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $128, %esp
+; X86-AVX512-NEXT: addl $140, %esp
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = shl i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; X64-SSE2-LABEL: ashr_64bytes:
+; FALLBACK0-LABEL: ashr_64bytes:
+; FALLBACK0: # %bb.0:
+; FALLBACK0-NEXT: pushq %r15
+; FALLBACK0-NEXT: pushq %r14
+; FALLBACK0-NEXT: pushq %r13
+; FALLBACK0-NEXT: pushq %r12
+; FALLBACK0-NEXT: pushq %rbx
+; FALLBACK0-NEXT: movq (%rdi), %rax
+; FALLBACK0-NEXT: movq 8(%rdi), %rcx
+; FALLBACK0-NEXT: movq 16(%rdi), %r8
+; FALLBACK0-NEXT: movq 24(%rdi), %r9
+; FALLBACK0-NEXT: movq 32(%rdi), %r10
+; FALLBACK0-NEXT: movq 40(%rdi), %r11
+; FALLBACK0-NEXT: movq 48(%rdi), %rbx
+; FALLBACK0-NEXT: movq 56(%rdi), %r14
+; FALLBACK0-NEXT: movl (%rsi), %edi
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: sarq $63, %r14
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK0-NEXT: leal (,%rdi,8), %eax
+; FALLBACK0-NEXT: andl $56, %eax
+; FALLBACK0-NEXT: andl $56, %edi
+; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
+; FALLBACK0-NEXT: movq %r8, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r11
+; FALLBACK0-NEXT: movl %eax, %esi
+; FALLBACK0-NEXT: notb %sil
+; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r9
+; FALLBACK0-NEXT: orq %r11, %r9
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r10
+; FALLBACK0-NEXT: addq %r8, %r8
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r8
+; FALLBACK0-NEXT: orq %r10, %r8
+; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK0-NEXT: movq %r10, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r15
+; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
+; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r11
+; FALLBACK0-NEXT: orq %r15, %r11
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %rbx
+; FALLBACK0-NEXT: addq %r10, %r10
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r10
+; FALLBACK0-NEXT: orq %rbx, %r10
+; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
+; FALLBACK0-NEXT: movq %rbx, %r12
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r12
+; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
+; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r15
+; FALLBACK0-NEXT: orq %r12, %r15
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r14
+; FALLBACK0-NEXT: addq %rbx, %rbx
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %rbx
+; FALLBACK0-NEXT: orq %r14, %rbx
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: shrq %cl, %r13
+; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
+; FALLBACK0-NEXT: movl %esi, %ecx
+; FALLBACK0-NEXT: shlq %cl, %r14
+; FALLBACK0-NEXT: orq %r13, %r14
+; FALLBACK0-NEXT: movl %eax, %ecx
+; FALLBACK0-NEXT: sarq %cl, %rdi
+; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK0-NEXT: movq %r14, 48(%rdx)
+; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
+; FALLBACK0-NEXT: movq %r15, 40(%rdx)
+; FALLBACK0-NEXT: movq %r10, 16(%rdx)
+; FALLBACK0-NEXT: movq %r11, 24(%rdx)
+; FALLBACK0-NEXT: movq %r8, (%rdx)
+; FALLBACK0-NEXT: movq %r9, 8(%rdx)
+; FALLBACK0-NEXT: popq %rbx
+; FALLBACK0-NEXT: popq %r12
+; FALLBACK0-NEXT: popq %r13
+; FALLBACK0-NEXT: popq %r14
+; FALLBACK0-NEXT: popq %r15
+; FALLBACK0-NEXT: retq
+;
+; FALLBACK1-LABEL: ashr_64bytes:
+; FALLBACK1: # %bb.0:
+; FALLBACK1-NEXT: pushq %r15
+; FALLBACK1-NEXT: pushq %r14
+; FALLBACK1-NEXT: pushq %rbx
+; FALLBACK1-NEXT: movq (%rdi), %rcx
+; FALLBACK1-NEXT: movq 8(%rdi), %r8
+; FALLBACK1-NEXT: movq 16(%rdi), %r9
+; FALLBACK1-NEXT: movq 24(%rdi), %r10
+; FALLBACK1-NEXT: movq 32(%rdi), %r11
+; FALLBACK1-NEXT: movq 40(%rdi), %rbx
+; FALLBACK1-NEXT: movq 48(%rdi), %r14
+; FALLBACK1-NEXT: movq 56(%rdi), %rdi
+; FALLBACK1-NEXT: movl (%rsi), %eax
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: sarq $63, %rdi
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK1-NEXT: leal (,%rax,8), %ecx
+; FALLBACK1-NEXT: andl $56, %ecx
+; FALLBACK1-NEXT: andl $56, %eax
+; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK1-NEXT: movq %r9, %r8
+; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq %r11, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK1-NEXT: movq %r14, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK1-NEXT: sarq %cl, %rax
+; FALLBACK1-NEXT: movq %r11, 48(%rdx)
+; FALLBACK1-NEXT: movq %rax, 56(%rdx)
+; FALLBACK1-NEXT: movq %r10, 32(%rdx)
+; FALLBACK1-NEXT: movq %r15, 40(%rdx)
+; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK1-NEXT: movq %rsi, (%rdx)
+; FALLBACK1-NEXT: movq %r8, 8(%rdx)
+; FALLBACK1-NEXT: popq %rbx
+; FALLBACK1-NEXT: popq %r14
+; FALLBACK1-NEXT: popq %r15
+; FALLBACK1-NEXT: retq
+;
+; FALLBACK2-LABEL: ashr_64bytes:
+; FALLBACK2: # %bb.0:
+; FALLBACK2-NEXT: pushq %rbp
+; FALLBACK2-NEXT: pushq %r15
+; FALLBACK2-NEXT: pushq %r14
+; FALLBACK2-NEXT: pushq %r13
+; FALLBACK2-NEXT: pushq %r12
+; FALLBACK2-NEXT: pushq %rbx
+; FALLBACK2-NEXT: pushq %rax
+; FALLBACK2-NEXT: movq (%rdi), %rcx
+; FALLBACK2-NEXT: movq 8(%rdi), %r8
+; FALLBACK2-NEXT: movq 16(%rdi), %r9
+; FALLBACK2-NEXT: movq 24(%rdi), %r10
+; FALLBACK2-NEXT: movq 32(%rdi), %r11
+; FALLBACK2-NEXT: movq 40(%rdi), %rbx
+; FALLBACK2-NEXT: movq 48(%rdi), %r14
+; FALLBACK2-NEXT: movq 56(%rdi), %rdi
+; FALLBACK2-NEXT: movl (%rsi), %eax
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: sarq $63, %rdi
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK2-NEXT: leal (,%rax,8), %ecx
+; FALLBACK2-NEXT: andl $56, %ecx
+; FALLBACK2-NEXT: andl $56, %eax
+; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
+; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
+; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
+; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
+; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
+; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
+; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT: movl %ecx, %r12d
+; FALLBACK2-NEXT: notb %r12b
+; FALLBACK2-NEXT: addq %r9, %r9
+; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT: orq %rbx, %r9
+; FALLBACK2-NEXT: addq %rdi, %rdi
+; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
+; FALLBACK2-NEXT: orq %r13, %rdi
+; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
+; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT: addq %r10, %r10
+; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT: orq %r8, %r10
+; FALLBACK2-NEXT: addq %rsi, %rsi
+; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
+; FALLBACK2-NEXT: orq %r11, %rsi
+; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
+; FALLBACK2-NEXT: orq %r15, %r8
+; FALLBACK2-NEXT: addq %r14, %r14
+; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
+; FALLBACK2-NEXT: orq %rbp, %r11
+; FALLBACK2-NEXT: addq %rax, %rax
+; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
+; FALLBACK2-NEXT: orq %r13, %rax
+; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
+; FALLBACK2-NEXT: movq %rax, 48(%rdx)
+; FALLBACK2-NEXT: movq %r11, 32(%rdx)
+; FALLBACK2-NEXT: movq %r8, 40(%rdx)
+; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
+; FALLBACK2-NEXT: movq %r10, 24(%rdx)
+; FALLBACK2-NEXT: movq %rdi, (%rdx)
+; FALLBACK2-NEXT: movq %r9, 8(%rdx)
+; FALLBACK2-NEXT: addq $8, %rsp
+; FALLBACK2-NEXT: popq %rbx
+; FALLBACK2-NEXT: popq %r12
+; FALLBACK2-NEXT: popq %r13
+; FALLBACK2-NEXT: popq %r14
+; FALLBACK2-NEXT: popq %r15
+; FALLBACK2-NEXT: popq %rbp
+; FALLBACK2-NEXT: retq
+;
+; FALLBACK3-LABEL: ashr_64bytes:
+; FALLBACK3: # %bb.0:
+; FALLBACK3-NEXT: pushq %r15
+; FALLBACK3-NEXT: pushq %r14
+; FALLBACK3-NEXT: pushq %rbx
+; FALLBACK3-NEXT: movq (%rdi), %rcx
+; FALLBACK3-NEXT: movq 8(%rdi), %r8
+; FALLBACK3-NEXT: movq 16(%rdi), %r9
+; FALLBACK3-NEXT: movq 24(%rdi), %r10
+; FALLBACK3-NEXT: movq 32(%rdi), %r11
+; FALLBACK3-NEXT: movq 40(%rdi), %rbx
+; FALLBACK3-NEXT: movq 48(%rdi), %r14
+; FALLBACK3-NEXT: movq 56(%rdi), %rdi
+; FALLBACK3-NEXT: movl (%rsi), %eax
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: sarq $63, %rdi
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK3-NEXT: leal (,%rax,8), %ecx
+; FALLBACK3-NEXT: andl $56, %ecx
+; FALLBACK3-NEXT: andl $56, %eax
+; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
+; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
+; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
+; FALLBACK3-NEXT: movq %r9, %r8
+; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
+; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
+; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq %r11, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
+; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
+; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
+; FALLBACK3-NEXT: movq %r14, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
+; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
+; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
+; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
+; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT: movq %r11, 48(%rdx)
+; FALLBACK3-NEXT: movq %r10, 32(%rdx)
+; FALLBACK3-NEXT: movq %r15, 40(%rdx)
+; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
+; FALLBACK3-NEXT: movq %rsi, (%rdx)
+; FALLBACK3-NEXT: movq %r8, 8(%rdx)
+; FALLBACK3-NEXT: movq %rax, 56(%rdx)
+; FALLBACK3-NEXT: popq %rbx
+; FALLBACK3-NEXT: popq %r14
+; FALLBACK3-NEXT: popq %r15
+; FALLBACK3-NEXT: retq
+;
+; FALLBACK4-LABEL: ashr_64bytes:
+; FALLBACK4: # %bb.0:
+; FALLBACK4-NEXT: pushq %rbp
+; FALLBACK4-NEXT: pushq %r15
+; FALLBACK4-NEXT: pushq %r14
+; FALLBACK4-NEXT: pushq %r13
+; FALLBACK4-NEXT: pushq %r12
+; FALLBACK4-NEXT: pushq %rbx
+; FALLBACK4-NEXT: pushq %rax
+; FALLBACK4-NEXT: movups (%rdi), %xmm0
+; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK4-NEXT: movq 48(%rdi), %rax
+; FALLBACK4-NEXT: movq 56(%rdi), %rcx
+; FALLBACK4-NEXT: movl (%rsi), %edi
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: sarq $63, %rcx
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK4-NEXT: leal (,%rdi,8), %eax
+; FALLBACK4-NEXT: andl $56, %eax
+; FALLBACK4-NEXT: andl $56, %edi
+; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r10
+; FALLBACK4-NEXT: movl %eax, %esi
+; FALLBACK4-NEXT: notb %sil
+; FALLBACK4-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r8
+; FALLBACK4-NEXT: orq %r10, %r8
+; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK4-NEXT: movq %r10, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbx
+; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r11
+; FALLBACK4-NEXT: orq %rbx, %r11
+; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK4-NEXT: movq %rbx, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r14
+; FALLBACK4-NEXT: addq %r10, %r10
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r10
+; FALLBACK4-NEXT: orq %r14, %r10
+; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK4-NEXT: movq %r14, %r13
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r13
+; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r15
+; FALLBACK4-NEXT: orq %r13, %r15
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r12
+; FALLBACK4-NEXT: addq %r14, %r14
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r14
+; FALLBACK4-NEXT: orq %r12, %r14
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %rbp
+; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %r12
+; FALLBACK4-NEXT: orq %rbp, %r12
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: shrq %cl, %r9
+; FALLBACK4-NEXT: addq %rbx, %rbx
+; FALLBACK4-NEXT: movl %esi, %ecx
+; FALLBACK4-NEXT: shlq %cl, %rbx
+; FALLBACK4-NEXT: orq %r9, %rbx
+; FALLBACK4-NEXT: movl %eax, %ecx
+; FALLBACK4-NEXT: sarq %cl, %rdi
+; FALLBACK4-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK4-NEXT: movq %r12, 48(%rdx)
+; FALLBACK4-NEXT: movq %r14, 32(%rdx)
+; FALLBACK4-NEXT: movq %r15, 40(%rdx)
+; FALLBACK4-NEXT: movq %r10, 16(%rdx)
+; FALLBACK4-NEXT: movq %r11, 24(%rdx)
+; FALLBACK4-NEXT: movq %r8, (%rdx)
+; FALLBACK4-NEXT: addq $8, %rsp
+; FALLBACK4-NEXT: popq %rbx
+; FALLBACK4-NEXT: popq %r12
+; FALLBACK4-NEXT: popq %r13
+; FALLBACK4-NEXT: popq %r14
+; FALLBACK4-NEXT: popq %r15
+; FALLBACK4-NEXT: popq %rbp
+; FALLBACK4-NEXT: retq
+;
+; FALLBACK5-LABEL: ashr_64bytes:
+; FALLBACK5: # %bb.0:
+; FALLBACK5-NEXT: pushq %r15
+; FALLBACK5-NEXT: pushq %r14
+; FALLBACK5-NEXT: pushq %rbx
+; FALLBACK5-NEXT: movups (%rdi), %xmm0
+; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK5-NEXT: movq 48(%rdi), %rcx
+; FALLBACK5-NEXT: movq 56(%rdi), %rdi
+; FALLBACK5-NEXT: movl (%rsi), %eax
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: sarq $63, %rdi
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK5-NEXT: leal (,%rax,8), %ecx
+; FALLBACK5-NEXT: andl $56, %ecx
+; FALLBACK5-NEXT: andl $56, %eax
+; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq %r9, %rsi
+; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK5-NEXT: movq %r10, %r8
+; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK5-NEXT: movq %r11, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK5-NEXT: movq %rax, %r15
+; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK5-NEXT: sarq %cl, %r11
+; FALLBACK5-NEXT: movq %r15, 8(%rdx)
+; FALLBACK5-NEXT: movq %r9, 48(%rdx)
+; FALLBACK5-NEXT: movq %r11, 56(%rdx)
+; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK5-NEXT: movq %r8, 16(%rdx)
+; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK5-NEXT: movq %r14, (%rdx)
+; FALLBACK5-NEXT: popq %rbx
+; FALLBACK5-NEXT: popq %r14
+; FALLBACK5-NEXT: popq %r15
+; FALLBACK5-NEXT: retq
+;
+; FALLBACK6-LABEL: ashr_64bytes:
+; FALLBACK6: # %bb.0:
+; FALLBACK6-NEXT: pushq %rbp
+; FALLBACK6-NEXT: pushq %r15
+; FALLBACK6-NEXT: pushq %r14
+; FALLBACK6-NEXT: pushq %r13
+; FALLBACK6-NEXT: pushq %r12
+; FALLBACK6-NEXT: pushq %rbx
+; FALLBACK6-NEXT: pushq %rax
+; FALLBACK6-NEXT: movups (%rdi), %xmm0
+; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK6-NEXT: movq 48(%rdi), %rcx
+; FALLBACK6-NEXT: movq 56(%rdi), %rdi
+; FALLBACK6-NEXT: movl (%rsi), %eax
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: sarq $63, %rdi
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK6-NEXT: leal (,%rax,8), %esi
+; FALLBACK6-NEXT: andl $56, %esi
+; FALLBACK6-NEXT: andl $56, %eax
+; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK6-NEXT: movl %esi, %ebx
+; FALLBACK6-NEXT: notb %bl
+; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK6-NEXT: orq %r11, %r8
+; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK6-NEXT: orq %r12, %r11
+; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK6-NEXT: addq %rdi, %rdi
+; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT: orq %r9, %rdi
+; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT: orq %r14, %r9
+; FALLBACK6-NEXT: addq %r10, %r10
+; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT: orq %r15, %r10
+; FALLBACK6-NEXT: addq %rax, %rax
+; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT: orq %r13, %rax
+; FALLBACK6-NEXT: addq %rcx, %rcx
+; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT: orq %rbp, %rcx
+; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK6-NEXT: movq %rax, 48(%rdx)
+; FALLBACK6-NEXT: movq %r10, 32(%rdx)
+; FALLBACK6-NEXT: movq %r9, 40(%rdx)
+; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK6-NEXT: movq %r11, 24(%rdx)
+; FALLBACK6-NEXT: movq %r8, (%rdx)
+; FALLBACK6-NEXT: addq $8, %rsp
+; FALLBACK6-NEXT: popq %rbx
+; FALLBACK6-NEXT: popq %r12
+; FALLBACK6-NEXT: popq %r13
+; FALLBACK6-NEXT: popq %r14
+; FALLBACK6-NEXT: popq %r15
+; FALLBACK6-NEXT: popq %rbp
+; FALLBACK6-NEXT: retq
+;
+; FALLBACK7-LABEL: ashr_64bytes:
+; FALLBACK7: # %bb.0:
+; FALLBACK7-NEXT: pushq %r15
+; FALLBACK7-NEXT: pushq %r14
+; FALLBACK7-NEXT: pushq %rbx
+; FALLBACK7-NEXT: movups (%rdi), %xmm0
+; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
+; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
+; FALLBACK7-NEXT: movq 48(%rdi), %rcx
+; FALLBACK7-NEXT: movq 56(%rdi), %rdi
+; FALLBACK7-NEXT: movl (%rsi), %eax
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: sarq $63, %rdi
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK7-NEXT: leal (,%rax,8), %ecx
+; FALLBACK7-NEXT: andl $56, %ecx
+; FALLBACK7-NEXT: andl $56, %eax
+; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq %r9, %rsi
+; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK7-NEXT: movq %r10, %r8
+; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK7-NEXT: movq %r11, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK7-NEXT: movq %rax, %r15
+; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT: movq %r15, 8(%rdx)
+; FALLBACK7-NEXT: movq %r9, 48(%rdx)
+; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK7-NEXT: movq %r8, 16(%rdx)
+; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK7-NEXT: movq %r14, (%rdx)
+; FALLBACK7-NEXT: movq %r10, 56(%rdx)
+; FALLBACK7-NEXT: popq %rbx
+; FALLBACK7-NEXT: popq %r14
+; FALLBACK7-NEXT: popq %r15
+; FALLBACK7-NEXT: retq
+;
+; FALLBACK8-LABEL: ashr_64bytes:
+; FALLBACK8: # %bb.0:
+; FALLBACK8-NEXT: pushq %rbp
+; FALLBACK8-NEXT: pushq %r15
+; FALLBACK8-NEXT: pushq %r14
+; FALLBACK8-NEXT: pushq %r13
+; FALLBACK8-NEXT: pushq %r12
+; FALLBACK8-NEXT: pushq %rbx
+; FALLBACK8-NEXT: pushq %rax
+; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK8-NEXT: movq 48(%rdi), %rax
+; FALLBACK8-NEXT: movq 56(%rdi), %rcx
+; FALLBACK8-NEXT: movl (%rsi), %edi
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: sarq $63, %rcx
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK8-NEXT: leal (,%rdi,8), %eax
+; FALLBACK8-NEXT: andl $56, %eax
+; FALLBACK8-NEXT: andl $56, %edi
+; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r10
+; FALLBACK8-NEXT: movl %eax, %esi
+; FALLBACK8-NEXT: notb %sil
+; FALLBACK8-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r8
+; FALLBACK8-NEXT: orq %r10, %r8
+; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK8-NEXT: movq %r10, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbx
+; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r11
+; FALLBACK8-NEXT: orq %rbx, %r11
+; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK8-NEXT: movq %rbx, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r14
+; FALLBACK8-NEXT: addq %r10, %r10
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r10
+; FALLBACK8-NEXT: orq %r14, %r10
+; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK8-NEXT: movq %r14, %r13
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r13
+; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r15
+; FALLBACK8-NEXT: orq %r13, %r15
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r12
+; FALLBACK8-NEXT: addq %r14, %r14
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r14
+; FALLBACK8-NEXT: orq %r12, %r14
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %rbp
+; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %r12
+; FALLBACK8-NEXT: orq %rbp, %r12
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: shrq %cl, %r9
+; FALLBACK8-NEXT: addq %rbx, %rbx
+; FALLBACK8-NEXT: movl %esi, %ecx
+; FALLBACK8-NEXT: shlq %cl, %rbx
+; FALLBACK8-NEXT: orq %r9, %rbx
+; FALLBACK8-NEXT: movl %eax, %ecx
+; FALLBACK8-NEXT: sarq %cl, %rdi
+; FALLBACK8-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK8-NEXT: movq %r12, 48(%rdx)
+; FALLBACK8-NEXT: movq %r14, 32(%rdx)
+; FALLBACK8-NEXT: movq %r15, 40(%rdx)
+; FALLBACK8-NEXT: movq %r10, 16(%rdx)
+; FALLBACK8-NEXT: movq %r11, 24(%rdx)
+; FALLBACK8-NEXT: movq %r8, (%rdx)
+; FALLBACK8-NEXT: addq $8, %rsp
+; FALLBACK8-NEXT: popq %rbx
+; FALLBACK8-NEXT: popq %r12
+; FALLBACK8-NEXT: popq %r13
+; FALLBACK8-NEXT: popq %r14
+; FALLBACK8-NEXT: popq %r15
+; FALLBACK8-NEXT: popq %rbp
+; FALLBACK8-NEXT: vzeroupper
+; FALLBACK8-NEXT: retq
+;
+; FALLBACK9-LABEL: ashr_64bytes:
+; FALLBACK9: # %bb.0:
+; FALLBACK9-NEXT: pushq %r15
+; FALLBACK9-NEXT: pushq %r14
+; FALLBACK9-NEXT: pushq %rbx
+; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK9-NEXT: movq 48(%rdi), %rcx
+; FALLBACK9-NEXT: movq 56(%rdi), %rdi
+; FALLBACK9-NEXT: movl (%rsi), %eax
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: sarq $63, %rdi
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK9-NEXT: leal (,%rax,8), %ecx
+; FALLBACK9-NEXT: andl $56, %ecx
+; FALLBACK9-NEXT: andl $56, %eax
+; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq %r9, %rsi
+; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT: movq %r10, %r8
+; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT: movq %r11, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK9-NEXT: movq %rax, %r15
+; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK9-NEXT: sarq %cl, %r11
+; FALLBACK9-NEXT: movq %r15, 8(%rdx)
+; FALLBACK9-NEXT: movq %r9, 48(%rdx)
+; FALLBACK9-NEXT: movq %r11, 56(%rdx)
+; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK9-NEXT: movq %r8, 16(%rdx)
+; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT: movq %r14, (%rdx)
+; FALLBACK9-NEXT: popq %rbx
+; FALLBACK9-NEXT: popq %r14
+; FALLBACK9-NEXT: popq %r15
+; FALLBACK9-NEXT: vzeroupper
+; FALLBACK9-NEXT: retq
+;
+; FALLBACK10-LABEL: ashr_64bytes:
+; FALLBACK10: # %bb.0:
+; FALLBACK10-NEXT: pushq %rbp
+; FALLBACK10-NEXT: pushq %r15
+; FALLBACK10-NEXT: pushq %r14
+; FALLBACK10-NEXT: pushq %r13
+; FALLBACK10-NEXT: pushq %r12
+; FALLBACK10-NEXT: pushq %rbx
+; FALLBACK10-NEXT: pushq %rax
+; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK10-NEXT: movq 48(%rdi), %rcx
+; FALLBACK10-NEXT: movq 56(%rdi), %rdi
+; FALLBACK10-NEXT: movl (%rsi), %eax
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: sarq $63, %rdi
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK10-NEXT: leal (,%rax,8), %esi
+; FALLBACK10-NEXT: andl $56, %esi
+; FALLBACK10-NEXT: andl $56, %eax
+; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK10-NEXT: movl %esi, %ebx
+; FALLBACK10-NEXT: notb %bl
+; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK10-NEXT: orq %r11, %r8
+; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK10-NEXT: orq %r12, %r11
+; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK10-NEXT: addq %rdi, %rdi
+; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT: orq %r9, %rdi
+; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT: orq %r14, %r9
+; FALLBACK10-NEXT: addq %r10, %r10
+; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT: orq %r15, %r10
+; FALLBACK10-NEXT: addq %rax, %rax
+; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT: orq %r13, %rax
+; FALLBACK10-NEXT: addq %rcx, %rcx
+; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT: orq %rbp, %rcx
+; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK10-NEXT: movq %rax, 48(%rdx)
+; FALLBACK10-NEXT: movq %r10, 32(%rdx)
+; FALLBACK10-NEXT: movq %r9, 40(%rdx)
+; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK10-NEXT: movq %r11, 24(%rdx)
+; FALLBACK10-NEXT: movq %r8, (%rdx)
+; FALLBACK10-NEXT: addq $8, %rsp
+; FALLBACK10-NEXT: popq %rbx
+; FALLBACK10-NEXT: popq %r12
+; FALLBACK10-NEXT: popq %r13
+; FALLBACK10-NEXT: popq %r14
+; FALLBACK10-NEXT: popq %r15
+; FALLBACK10-NEXT: popq %rbp
+; FALLBACK10-NEXT: vzeroupper
+; FALLBACK10-NEXT: retq
+;
+; FALLBACK11-LABEL: ashr_64bytes:
+; FALLBACK11: # %bb.0:
+; FALLBACK11-NEXT: pushq %r15
+; FALLBACK11-NEXT: pushq %r14
+; FALLBACK11-NEXT: pushq %rbx
+; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK11-NEXT: movq 48(%rdi), %rcx
+; FALLBACK11-NEXT: movq 56(%rdi), %rdi
+; FALLBACK11-NEXT: movl (%rsi), %eax
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: sarq $63, %rdi
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK11-NEXT: leal (,%rax,8), %ecx
+; FALLBACK11-NEXT: andl $56, %ecx
+; FALLBACK11-NEXT: andl $56, %eax
+; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq %r9, %rsi
+; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT: movq %r10, %r8
+; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT: movq %r11, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK11-NEXT: movq %rax, %r15
+; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT: movq %r15, 8(%rdx)
+; FALLBACK11-NEXT: movq %r9, 48(%rdx)
+; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK11-NEXT: movq %r8, 16(%rdx)
+; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT: movq %r14, (%rdx)
+; FALLBACK11-NEXT: movq %r10, 56(%rdx)
+; FALLBACK11-NEXT: popq %rbx
+; FALLBACK11-NEXT: popq %r14
+; FALLBACK11-NEXT: popq %r15
+; FALLBACK11-NEXT: vzeroupper
+; FALLBACK11-NEXT: retq
+;
+; FALLBACK12-LABEL: ashr_64bytes:
+; FALLBACK12: # %bb.0:
+; FALLBACK12-NEXT: pushq %rbp
+; FALLBACK12-NEXT: pushq %r15
+; FALLBACK12-NEXT: pushq %r14
+; FALLBACK12-NEXT: pushq %r13
+; FALLBACK12-NEXT: pushq %r12
+; FALLBACK12-NEXT: pushq %rbx
+; FALLBACK12-NEXT: pushq %rax
+; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK12-NEXT: movq 48(%rdi), %rax
+; FALLBACK12-NEXT: movq 56(%rdi), %rcx
+; FALLBACK12-NEXT: movl (%rsi), %edi
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: sarq $63, %rcx
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK12-NEXT: leal (,%rdi,8), %eax
+; FALLBACK12-NEXT: andl $56, %eax
+; FALLBACK12-NEXT: andl $56, %edi
+; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10
+; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r10
+; FALLBACK12-NEXT: movl %eax, %esi
+; FALLBACK12-NEXT: notb %sil
+; FALLBACK12-NEXT: leaq (%r9,%r9), %r8
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r8
+; FALLBACK12-NEXT: orq %r10, %r8
+; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10
+; FALLBACK12-NEXT: movq %r10, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbx
+; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12
+; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r11
+; FALLBACK12-NEXT: orq %rbx, %r11
+; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx
+; FALLBACK12-NEXT: movq %rbx, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r14
+; FALLBACK12-NEXT: addq %r10, %r10
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r10
+; FALLBACK12-NEXT: orq %r14, %r10
+; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14
+; FALLBACK12-NEXT: movq %r14, %r13
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r13
+; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp
+; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r15
+; FALLBACK12-NEXT: orq %r13, %r15
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r12
+; FALLBACK12-NEXT: addq %r14, %r14
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r14
+; FALLBACK12-NEXT: orq %r12, %r14
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %rbp
+; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi
+; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %r12
+; FALLBACK12-NEXT: orq %rbp, %r12
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: shrq %cl, %r9
+; FALLBACK12-NEXT: addq %rbx, %rbx
+; FALLBACK12-NEXT: movl %esi, %ecx
+; FALLBACK12-NEXT: shlq %cl, %rbx
+; FALLBACK12-NEXT: orq %r9, %rbx
+; FALLBACK12-NEXT: movl %eax, %ecx
+; FALLBACK12-NEXT: sarq %cl, %rdi
+; FALLBACK12-NEXT: movq %rdi, 56(%rdx)
+; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
+; FALLBACK12-NEXT: movq %r12, 48(%rdx)
+; FALLBACK12-NEXT: movq %r14, 32(%rdx)
+; FALLBACK12-NEXT: movq %r15, 40(%rdx)
+; FALLBACK12-NEXT: movq %r10, 16(%rdx)
+; FALLBACK12-NEXT: movq %r11, 24(%rdx)
+; FALLBACK12-NEXT: movq %r8, (%rdx)
+; FALLBACK12-NEXT: addq $8, %rsp
+; FALLBACK12-NEXT: popq %rbx
+; FALLBACK12-NEXT: popq %r12
+; FALLBACK12-NEXT: popq %r13
+; FALLBACK12-NEXT: popq %r14
+; FALLBACK12-NEXT: popq %r15
+; FALLBACK12-NEXT: popq %rbp
+; FALLBACK12-NEXT: vzeroupper
+; FALLBACK12-NEXT: retq
+;
+; FALLBACK13-LABEL: ashr_64bytes:
+; FALLBACK13: # %bb.0:
+; FALLBACK13-NEXT: pushq %r15
+; FALLBACK13-NEXT: pushq %r14
+; FALLBACK13-NEXT: pushq %rbx
+; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK13-NEXT: movq 48(%rdi), %rcx
+; FALLBACK13-NEXT: movq 56(%rdi), %rdi
+; FALLBACK13-NEXT: movl (%rsi), %eax
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: sarq $63, %rdi
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK13-NEXT: leal (,%rax,8), %ecx
+; FALLBACK13-NEXT: andl $56, %ecx
+; FALLBACK13-NEXT: andl $56, %eax
+; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq %r9, %rsi
+; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK13-NEXT: movq %r10, %r8
+; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK13-NEXT: movq %r11, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK13-NEXT: movq %rax, %r15
+; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK13-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK13-NEXT: sarq %cl, %r11
+; FALLBACK13-NEXT: movq %r15, 8(%rdx)
+; FALLBACK13-NEXT: movq %r9, 48(%rdx)
+; FALLBACK13-NEXT: movq %r11, 56(%rdx)
+; FALLBACK13-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK13-NEXT: movq %r8, 16(%rdx)
+; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK13-NEXT: movq %r14, (%rdx)
+; FALLBACK13-NEXT: popq %rbx
+; FALLBACK13-NEXT: popq %r14
+; FALLBACK13-NEXT: popq %r15
+; FALLBACK13-NEXT: vzeroupper
+; FALLBACK13-NEXT: retq
+;
+; FALLBACK14-LABEL: ashr_64bytes:
+; FALLBACK14: # %bb.0:
+; FALLBACK14-NEXT: pushq %rbp
+; FALLBACK14-NEXT: pushq %r15
+; FALLBACK14-NEXT: pushq %r14
+; FALLBACK14-NEXT: pushq %r13
+; FALLBACK14-NEXT: pushq %r12
+; FALLBACK14-NEXT: pushq %rbx
+; FALLBACK14-NEXT: pushq %rax
+; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK14-NEXT: movq 48(%rdi), %rcx
+; FALLBACK14-NEXT: movq 56(%rdi), %rdi
+; FALLBACK14-NEXT: movl (%rsi), %eax
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: sarq $63, %rdi
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK14-NEXT: leal (,%rax,8), %esi
+; FALLBACK14-NEXT: andl $56, %esi
+; FALLBACK14-NEXT: andl $56, %eax
+; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx
+; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi
+; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12
+; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13
+; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9
+; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10
+; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14
+; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15
+; FALLBACK14-NEXT: movl %esi, %ebx
+; FALLBACK14-NEXT: notb %bl
+; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp
+; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
+; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
+; FALLBACK14-NEXT: orq %r11, %r8
+; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
+; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
+; FALLBACK14-NEXT: orq %r12, %r11
+; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12
+; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13
+; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp
+; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax
+; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi
+; FALLBACK14-NEXT: addq %rdi, %rdi
+; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT: orq %r9, %rdi
+; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
+; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT: orq %r14, %r9
+; FALLBACK14-NEXT: addq %r10, %r10
+; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT: orq %r15, %r10
+; FALLBACK14-NEXT: addq %rax, %rax
+; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT: orq %r13, %rax
+; FALLBACK14-NEXT: addq %rcx, %rcx
+; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx
+; FALLBACK14-NEXT: orq %rbp, %rcx
+; FALLBACK14-NEXT: movq %rsi, 56(%rdx)
+; FALLBACK14-NEXT: movq %rcx, 8(%rdx)
+; FALLBACK14-NEXT: movq %rax, 48(%rdx)
+; FALLBACK14-NEXT: movq %r10, 32(%rdx)
+; FALLBACK14-NEXT: movq %r9, 40(%rdx)
+; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
+; FALLBACK14-NEXT: movq %r11, 24(%rdx)
+; FALLBACK14-NEXT: movq %r8, (%rdx)
+; FALLBACK14-NEXT: addq $8, %rsp
+; FALLBACK14-NEXT: popq %rbx
+; FALLBACK14-NEXT: popq %r12
+; FALLBACK14-NEXT: popq %r13
+; FALLBACK14-NEXT: popq %r14
+; FALLBACK14-NEXT: popq %r15
+; FALLBACK14-NEXT: popq %rbp
+; FALLBACK14-NEXT: vzeroupper
+; FALLBACK14-NEXT: retq
+;
+; FALLBACK15-LABEL: ashr_64bytes:
+; FALLBACK15: # %bb.0:
+; FALLBACK15-NEXT: pushq %r15
+; FALLBACK15-NEXT: pushq %r14
+; FALLBACK15-NEXT: pushq %rbx
+; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
+; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1
+; FALLBACK15-NEXT: movq 48(%rdi), %rcx
+; FALLBACK15-NEXT: movq 56(%rdi), %rdi
+; FALLBACK15-NEXT: movl (%rsi), %eax
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: sarq $63, %rdi
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; FALLBACK15-NEXT: leal (,%rax,8), %ecx
+; FALLBACK15-NEXT: andl $56, %ecx
+; FALLBACK15-NEXT: andl $56, %eax
+; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
+; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq %r9, %rsi
+; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
+; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT: movq %r10, %r8
+; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
+; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
+; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT: movq %r11, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
+; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
+; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
+; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
+; FALLBACK15-NEXT: movq %rax, %r15
+; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
+; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10
+; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
+; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT: movq %r15, 8(%rdx)
+; FALLBACK15-NEXT: movq %r9, 48(%rdx)
+; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
+; FALLBACK15-NEXT: movq %r8, 16(%rdx)
+; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT: movq %r14, (%rdx)
+; FALLBACK15-NEXT: movq %r10, 56(%rdx)
+; FALLBACK15-NEXT: popq %rbx
+; FALLBACK15-NEXT: popq %r14
+; FALLBACK15-NEXT: popq %r15
+; FALLBACK15-NEXT: vzeroupper
+; FALLBACK15-NEXT: retq
+;
+; FALLBACK16-LABEL: ashr_64bytes:
+; FALLBACK16: # %bb.0:
+; FALLBACK16-NEXT: pushl %ebp
+; FALLBACK16-NEXT: pushl %ebx
+; FALLBACK16-NEXT: pushl %edi
+; FALLBACK16-NEXT: pushl %esi
+; FALLBACK16-NEXT: subl $204, %esp
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT: movl (%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 4(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 8(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 12(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 16(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 20(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 24(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 28(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 32(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 36(%ecx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 40(%ecx), %ebx
+; FALLBACK16-NEXT: movl 44(%ecx), %edi
+; FALLBACK16-NEXT: movl 48(%ecx), %esi
+; FALLBACK16-NEXT: movl 52(%ecx), %edx
+; FALLBACK16-NEXT: movl 56(%ecx), %eax
+; FALLBACK16-NEXT: movl 60(%ecx), %ecx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT: movl (%ebp), %ebp
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: sarl $31, %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT: movl %ebp, %ecx
+; FALLBACK16-NEXT: movl %ebp, %esi
+; FALLBACK16-NEXT: andl $60, %esi
+; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK16-NEXT: shll $3, %ecx
+; FALLBACK16-NEXT: andl $24, %ecx
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: movl %ecx, %ebx
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: notb %ch
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %eax, %edi
+; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %edx, %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %esi, %edx
+; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edx
+; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi
+; FALLBACK16-NEXT: leal (%edi,%edi), %eax
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %eax
+; FALLBACK16-NEXT: orl %edx, %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %ebp, %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %esi, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax
+; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebx, %edx
+; FALLBACK16-NEXT: movb %dl, %cl
+; FALLBACK16-NEXT: shrl %cl, %edi
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT: addl %ebx, %ebx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebx
+; FALLBACK16-NEXT: orl %edi, %ebx
+; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT: movl %edi, %eax
+; FALLBACK16-NEXT: movl %edx, %ebx
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movl %ebp, %edx
+; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: shrl %cl, %esi
+; FALLBACK16-NEXT: addl %edi, %edi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edi
+; FALLBACK16-NEXT: orl %esi, %edi
+; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT: movl %esi, %eax
+; FALLBACK16-NEXT: movl %ebx, %ecx
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx
+; FALLBACK16-NEXT: leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %ebp
+; FALLBACK16-NEXT: orl %eax, %ebp
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: addl %esi, %esi
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %esi
+; FALLBACK16-NEXT: orl %eax, %esi
+; FALLBACK16-NEXT: movb %bl, %cl
+; FALLBACK16-NEXT: movl %edx, %eax
+; FALLBACK16-NEXT: shrl %cl, %eax
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT: movb %ch, %cl
+; FALLBACK16-NEXT: shll %cl, %edx
+; FALLBACK16-NEXT: orl %eax, %edx
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT: sarl %cl, %ebx
+; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT: movl %ebx, 60(%eax)
+; FALLBACK16-NEXT: movl %edx, 56(%eax)
+; FALLBACK16-NEXT: movl %esi, 48(%eax)
+; FALLBACK16-NEXT: movl %ebp, 52(%eax)
+; FALLBACK16-NEXT: movl %edi, 40(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 44(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 32(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 36(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 24(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 28(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 16(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 20(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 8(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 12(%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, (%eax)
+; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT: movl %ecx, 4(%eax)
+; FALLBACK16-NEXT: addl $204, %esp
+; FALLBACK16-NEXT: popl %esi
+; FALLBACK16-NEXT: popl %edi
+; FALLBACK16-NEXT: popl %ebx
+; FALLBACK16-NEXT: popl %ebp
+; FALLBACK16-NEXT: retl
+;
+; FALLBACK17-LABEL: ashr_64bytes:
+; FALLBACK17: # %bb.0:
+; FALLBACK17-NEXT: pushl %ebp
+; FALLBACK17-NEXT: pushl %ebx
+; FALLBACK17-NEXT: pushl %edi
+; FALLBACK17-NEXT: pushl %esi
+; FALLBACK17-NEXT: subl $188, %esp
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT: movl (%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 4(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 8(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 12(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 16(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 20(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 24(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 28(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 32(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 36(%eax), %ecx
+; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 40(%eax), %ebp
+; FALLBACK17-NEXT: movl 44(%eax), %ebx
+; FALLBACK17-NEXT: movl 48(%eax), %edi
+; FALLBACK17-NEXT: movl 52(%eax), %esi
+; FALLBACK17-NEXT: movl 56(%eax), %edx
+; FALLBACK17-NEXT: movl 60(%eax), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK17-NEXT: movl (%ecx), %ecx
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: sarl $31, %eax
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
+; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shll $3, %ecx
+; FALLBACK17-NEXT: andl $24, %ecx
+; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %esi
+; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edx
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %edx
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl %eax, %edi
+; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK17-NEXT: sarl %cl, %eax
+; FALLBACK17-NEXT: movl %eax, 60(%ebp)
+; FALLBACK17-NEXT: movl %esi, 48(%ebp)
+; FALLBACK17-NEXT: movl %edi, 52(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
+; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT: movl %eax, 4(%ebp)
+; FALLBACK17-NEXT: addl $188, %esp
+; FALLBACK17-NEXT: popl %esi
+; FALLBACK17-NEXT: popl %edi
+; FALLBACK17-NEXT: popl %ebx
+; FALLBACK17-NEXT: popl %ebp
+; FALLBACK17-NEXT: retl
+;
+; FALLBACK18-LABEL: ashr_64bytes:
+; FALLBACK18: # %bb.0:
+; FALLBACK18-NEXT: pushl %ebp
+; FALLBACK18-NEXT: pushl %ebx
+; FALLBACK18-NEXT: pushl %edi
+; FALLBACK18-NEXT: pushl %esi
+; FALLBACK18-NEXT: subl $204, %esp
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 4(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 8(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 12(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 16(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 20(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 24(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 28(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 32(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 36(%eax), %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 40(%eax), %ebp
+; FALLBACK18-NEXT: movl 44(%eax), %ebx
+; FALLBACK18-NEXT: movl 48(%eax), %edi
+; FALLBACK18-NEXT: movl 52(%eax), %esi
+; FALLBACK18-NEXT: movl 56(%eax), %edx
+; FALLBACK18-NEXT: movl 60(%eax), %ecx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl (%eax), %eax
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: sarl $31, %ecx
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT: movl %eax, %ecx
+; FALLBACK18-NEXT: leal (,%eax,8), %edx
+; FALLBACK18-NEXT: andl $24, %edx
+; FALLBACK18-NEXT: andl $60, %ecx
+; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl %edx, %ebx
+; FALLBACK18-NEXT: notb %bl
+; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: orl %edi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK18-NEXT: leal (%esi,%esi), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT: orl %eax, %edi
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT: addl %edi, %edi
+; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT: orl %esi, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: leal (%eax,%eax), %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK18-NEXT: movl %ecx, %edi
+; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK18-NEXT: orl %ebp, %eax
+; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT: addl %esi, %esi
+; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK18-NEXT: orl %ecx, %esi
+; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK18-NEXT: orl %edi, %ecx
+; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT: addl %eax, %eax
+; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK18-NEXT: addl %ebp, %ebp
+; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK18-NEXT: orl %eax, %ebx
+; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK18-NEXT: movl %edx, 60(%eax)
+; FALLBACK18-NEXT: movl %ebx, 56(%eax)
+; FALLBACK18-NEXT: movl %edi, 48(%eax)
+; FALLBACK18-NEXT: movl %ecx, 52(%eax)
+; FALLBACK18-NEXT: movl %esi, 40(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 44(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 32(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 36(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 24(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 28(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 16(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 20(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 8(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 12(%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, (%eax)
+; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT: movl %ecx, 4(%eax)
+; FALLBACK18-NEXT: addl $204, %esp
+; FALLBACK18-NEXT: popl %esi
+; FALLBACK18-NEXT: popl %edi
+; FALLBACK18-NEXT: popl %ebx
+; FALLBACK18-NEXT: popl %ebp
+; FALLBACK18-NEXT: retl
+;
+; FALLBACK19-LABEL: ashr_64bytes:
+; FALLBACK19: # %bb.0:
+; FALLBACK19-NEXT: pushl %ebp
+; FALLBACK19-NEXT: pushl %ebx
+; FALLBACK19-NEXT: pushl %edi
+; FALLBACK19-NEXT: pushl %esi
+; FALLBACK19-NEXT: subl $188, %esp
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT: movl (%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 4(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 8(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 12(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 16(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 20(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 24(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 28(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 32(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 36(%eax), %ecx
+; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 40(%eax), %ebp
+; FALLBACK19-NEXT: movl 44(%eax), %ebx
+; FALLBACK19-NEXT: movl 48(%eax), %edi
+; FALLBACK19-NEXT: movl 52(%eax), %esi
+; FALLBACK19-NEXT: movl 56(%eax), %edx
+; FALLBACK19-NEXT: movl 60(%eax), %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK19-NEXT: movl (%ecx), %ecx
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: sarl $31, %eax
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT: movl %ecx, %ebp
+; FALLBACK19-NEXT: andl $60, %ebp
+; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shll $3, %ecx
+; FALLBACK19-NEXT: andl $24, %ecx
+; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %esi
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl %edi, %edx
+; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK19-NEXT: movl %eax, 56(%ebp)
+; FALLBACK19-NEXT: movl %esi, 48(%ebp)
+; FALLBACK19-NEXT: movl %edx, 52(%ebp)
+; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 44(%ebp)
+; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 32(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 36(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 24(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 28(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 16(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 20(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 8(%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK19-NEXT: movl %eax, 12(%ebp)
+; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK19-NEXT: movl %edi, (%ebp)
+; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK19-NEXT: movl %eax, 60(%ebp)
+; FALLBACK19-NEXT: addl $188, %esp
+; FALLBACK19-NEXT: popl %esi
+; FALLBACK19-NEXT: popl %edi
+; FALLBACK19-NEXT: popl %ebx
+; FALLBACK19-NEXT: popl %ebp
+; FALLBACK19-NEXT: retl
+;
+; FALLBACK20-LABEL: ashr_64bytes:
+; FALLBACK20: # %bb.0:
+; FALLBACK20-NEXT: pushl %ebp
+; FALLBACK20-NEXT: pushl %ebx
+; FALLBACK20-NEXT: pushl %edi
+; FALLBACK20-NEXT: pushl %esi
+; FALLBACK20-NEXT: subl $204, %esp
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT: movups (%ecx), %xmm0
+; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK20-NEXT: movl 48(%ecx), %edx
+; FALLBACK20-NEXT: movl 52(%ecx), %esi
+; FALLBACK20-NEXT: movl 56(%ecx), %edi
+; FALLBACK20-NEXT: movl 60(%ecx), %ecx
+; FALLBACK20-NEXT: movl (%eax), %eax
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: sarl $31, %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT: movl %eax, %esi
+; FALLBACK20-NEXT: andl $60, %esi
+; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK20-NEXT: shll $3, %eax
+; FALLBACK20-NEXT: andl $24, %eax
+; FALLBACK20-NEXT: movl %edx, %edi
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb %al, %ch
+; FALLBACK20-NEXT: notb %ch
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %edi, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK20-NEXT: movl %edx, %ebp
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %edx, %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %ebx, %edx
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movl %eax, %edx
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: addl %eax, %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT: leal (%edi,%edi), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %eax, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK20-NEXT: movl %ebx, %ebp
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK20-NEXT: leal (%edx,%edx), %eax
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %eax
+; FALLBACK20-NEXT: orl %ebp, %eax
+; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edi
+; FALLBACK20-NEXT: addl %ebx, %ebx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %edi, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK20-NEXT: movl %edi, %ebp
+; FALLBACK20-NEXT: movl %eax, %ecx
+; FALLBACK20-NEXT: shrl %cl, %ebp
+; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebx
+; FALLBACK20-NEXT: orl %ebp, %ebx
+; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %edx
+; FALLBACK20-NEXT: addl %edi, %edi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edi
+; FALLBACK20-NEXT: orl %edx, %edi
+; FALLBACK20-NEXT: movl %esi, %edx
+; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK20-NEXT: movl %esi, %ebx
+; FALLBACK20-NEXT: movb %al, %cl
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %ebp
+; FALLBACK20-NEXT: orl %ebx, %ebp
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT: shrl %cl, %ebx
+; FALLBACK20-NEXT: addl %esi, %esi
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %esi
+; FALLBACK20-NEXT: orl %ebx, %esi
+; FALLBACK20-NEXT: movb %dl, %cl
+; FALLBACK20-NEXT: shrl %cl, %eax
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT: movb %ch, %cl
+; FALLBACK20-NEXT: shll %cl, %edx
+; FALLBACK20-NEXT: orl %eax, %edx
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT: sarl %cl, %ebx
+; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT: movl %ebx, 60(%eax)
+; FALLBACK20-NEXT: movl %edx, 56(%eax)
+; FALLBACK20-NEXT: movl %esi, 48(%eax)
+; FALLBACK20-NEXT: movl %ebp, 52(%eax)
+; FALLBACK20-NEXT: movl %edi, 40(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 44(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 32(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 36(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 24(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 28(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 16(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 20(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 8(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 12(%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, (%eax)
+; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT: movl %ecx, 4(%eax)
+; FALLBACK20-NEXT: addl $204, %esp
+; FALLBACK20-NEXT: popl %esi
+; FALLBACK20-NEXT: popl %edi
+; FALLBACK20-NEXT: popl %ebx
+; FALLBACK20-NEXT: popl %ebp
+; FALLBACK20-NEXT: retl
+;
+; FALLBACK21-LABEL: ashr_64bytes:
+; FALLBACK21: # %bb.0:
+; FALLBACK21-NEXT: pushl %ebp
+; FALLBACK21-NEXT: pushl %ebx
+; FALLBACK21-NEXT: pushl %edi
+; FALLBACK21-NEXT: pushl %esi
+; FALLBACK21-NEXT: subl $188, %esp
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT: movups (%eax), %xmm0
+; FALLBACK21-NEXT: movups 16(%eax), %xmm1
+; FALLBACK21-NEXT: movups 32(%eax), %xmm2
+; FALLBACK21-NEXT: movl 48(%eax), %edx
+; FALLBACK21-NEXT: movl 52(%eax), %esi
+; FALLBACK21-NEXT: movl 56(%eax), %edi
+; FALLBACK21-NEXT: movl 60(%eax), %eax
+; FALLBACK21-NEXT: movl (%ecx), %ecx
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: sarl $31, %eax
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
+; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shll $3, %ecx
+; FALLBACK21-NEXT: andl $24, %ecx
+; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %esi
+; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edx
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %edx
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl %eax, %edi
+; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT: sarl %cl, %eax
+; FALLBACK21-NEXT: movl %eax, 60(%ebp)
+; FALLBACK21-NEXT: movl %esi, 48(%ebp)
+; FALLBACK21-NEXT: movl %edi, 52(%ebp)
+; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK21-NEXT: movl %eax, 4(%ebp)
+; FALLBACK21-NEXT: addl $188, %esp
+; FALLBACK21-NEXT: popl %esi
+; FALLBACK21-NEXT: popl %edi
+; FALLBACK21-NEXT: popl %ebx
+; FALLBACK21-NEXT: popl %ebp
+; FALLBACK21-NEXT: retl
+;
+; FALLBACK22-LABEL: ashr_64bytes:
+; FALLBACK22: # %bb.0:
+; FALLBACK22-NEXT: pushl %ebp
+; FALLBACK22-NEXT: pushl %ebx
+; FALLBACK22-NEXT: pushl %edi
+; FALLBACK22-NEXT: pushl %esi
+; FALLBACK22-NEXT: subl $204, %esp
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK22-NEXT: movups (%ecx), %xmm0
+; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
+; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
+; FALLBACK22-NEXT: movl 48(%ecx), %edx
+; FALLBACK22-NEXT: movl 52(%ecx), %esi
+; FALLBACK22-NEXT: movl 56(%ecx), %edi
+; FALLBACK22-NEXT: movl 60(%ecx), %ecx
+; FALLBACK22-NEXT: movl (%eax), %eax
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: sarl $31, %ecx
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK22-NEXT: movl %eax, %ecx
+; FALLBACK22-NEXT: leal (,%eax,8), %edx
+; FALLBACK22-NEXT: andl $24, %edx
+; FALLBACK22-NEXT: andl $60, %ecx
+; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl %edx, %ebx
+; FALLBACK22-NEXT: notb %bl
+; FALLBACK22-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: orl %edi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT: leal (%esi,%esi), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT: orl %eax, %edi
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT: addl %edi, %edi
+; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT: orl %esi, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: leal (%eax,%eax), %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK22-NEXT: movl %ecx, %edi
+; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT: orl %ebp, %eax
+; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT: addl %esi, %esi
+; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK22-NEXT: orl %ecx, %esi
+; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT: orl %edi, %ecx
+; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT: addl %eax, %eax
+; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK22-NEXT: addl %ebp, %ebp
+; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK22-NEXT: orl %eax, %ebx
+; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK22-NEXT: movl %edx, 60(%eax)
+; FALLBACK22-NEXT: movl %ebx, 56(%eax)
+; FALLBACK22-NEXT: movl %edi, 48(%eax)
+; FALLBACK22-NEXT: movl %ecx, 52(%eax)
+; FALLBACK22-NEXT: movl %esi, 40(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 44(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 32(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 36(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 24(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 28(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 16(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 20(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 8(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 12(%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, (%eax)
+; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT: movl %ecx, 4(%eax)
+; FALLBACK22-NEXT: addl $204, %esp
+; FALLBACK22-NEXT: popl %esi
+; FALLBACK22-NEXT: popl %edi
+; FALLBACK22-NEXT: popl %ebx
+; FALLBACK22-NEXT: popl %ebp
+; FALLBACK22-NEXT: retl
+;
+; FALLBACK23-LABEL: ashr_64bytes:
+; FALLBACK23: # %bb.0:
+; FALLBACK23-NEXT: pushl %ebp
+; FALLBACK23-NEXT: pushl %ebx
+; FALLBACK23-NEXT: pushl %edi
+; FALLBACK23-NEXT: pushl %esi
+; FALLBACK23-NEXT: subl $188, %esp
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK23-NEXT: movups (%eax), %xmm0
+; FALLBACK23-NEXT: movups 16(%eax), %xmm1
+; FALLBACK23-NEXT: movups 32(%eax), %xmm2
+; FALLBACK23-NEXT: movl 48(%eax), %edx
+; FALLBACK23-NEXT: movl 52(%eax), %esi
+; FALLBACK23-NEXT: movl 56(%eax), %edi
+; FALLBACK23-NEXT: movl 60(%eax), %eax
+; FALLBACK23-NEXT: movl (%ecx), %ecx
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: sarl $31, %eax
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK23-NEXT: movl %ecx, %ebp
+; FALLBACK23-NEXT: andl $60, %ebp
+; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shll $3, %ecx
+; FALLBACK23-NEXT: andl $24, %ecx
+; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %esi
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl %edi, %edx
+; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK23-NEXT: movl %eax, 56(%ebp)
+; FALLBACK23-NEXT: movl %esi, 48(%ebp)
+; FALLBACK23-NEXT: movl %edx, 52(%ebp)
+; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 44(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 32(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 36(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 24(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 28(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 16(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 20(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 8(%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK23-NEXT: movl %eax, 12(%ebp)
+; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK23-NEXT: movl %edi, (%ebp)
+; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK23-NEXT: movl %eax, 60(%ebp)
+; FALLBACK23-NEXT: addl $188, %esp
+; FALLBACK23-NEXT: popl %esi
+; FALLBACK23-NEXT: popl %edi
+; FALLBACK23-NEXT: popl %ebx
+; FALLBACK23-NEXT: popl %ebp
+; FALLBACK23-NEXT: retl
+;
+; FALLBACK24-LABEL: ashr_64bytes:
+; FALLBACK24: # %bb.0:
+; FALLBACK24-NEXT: pushl %ebp
+; FALLBACK24-NEXT: pushl %ebx
+; FALLBACK24-NEXT: pushl %edi
+; FALLBACK24-NEXT: pushl %esi
+; FALLBACK24-NEXT: subl $204, %esp
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK24-NEXT: movl 48(%ecx), %edx
+; FALLBACK24-NEXT: movl 52(%ecx), %esi
+; FALLBACK24-NEXT: movl 56(%ecx), %edi
+; FALLBACK24-NEXT: movl 60(%ecx), %ecx
+; FALLBACK24-NEXT: movl (%eax), %eax
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: sarl $31, %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT: movl %eax, %esi
+; FALLBACK24-NEXT: andl $60, %esi
+; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT: shll $3, %eax
+; FALLBACK24-NEXT: andl $24, %eax
+; FALLBACK24-NEXT: movl %edx, %edi
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb %al, %ch
+; FALLBACK24-NEXT: notb %ch
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %edi, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK24-NEXT: movl %edx, %ebp
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %edx, %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %ebx, %edx
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movl %eax, %edx
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: addl %eax, %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT: leal (%edi,%edi), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %eax, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK24-NEXT: movl %ebx, %ebp
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK24-NEXT: leal (%edx,%edx), %eax
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %eax
+; FALLBACK24-NEXT: orl %ebp, %eax
+; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edi
+; FALLBACK24-NEXT: addl %ebx, %ebx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %edi, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK24-NEXT: movl %edi, %ebp
+; FALLBACK24-NEXT: movl %eax, %ecx
+; FALLBACK24-NEXT: shrl %cl, %ebp
+; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebx
+; FALLBACK24-NEXT: orl %ebp, %ebx
+; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %edx
+; FALLBACK24-NEXT: addl %edi, %edi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edi
+; FALLBACK24-NEXT: orl %edx, %edi
+; FALLBACK24-NEXT: movl %esi, %edx
+; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK24-NEXT: movl %esi, %ebx
+; FALLBACK24-NEXT: movb %al, %cl
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %ebp
+; FALLBACK24-NEXT: orl %ebx, %ebp
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT: shrl %cl, %ebx
+; FALLBACK24-NEXT: addl %esi, %esi
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %esi
+; FALLBACK24-NEXT: orl %ebx, %esi
+; FALLBACK24-NEXT: movb %dl, %cl
+; FALLBACK24-NEXT: shrl %cl, %eax
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT: movb %ch, %cl
+; FALLBACK24-NEXT: shll %cl, %edx
+; FALLBACK24-NEXT: orl %eax, %edx
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT: sarl %cl, %ebx
+; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT: movl %ebx, 60(%eax)
+; FALLBACK24-NEXT: movl %edx, 56(%eax)
+; FALLBACK24-NEXT: movl %esi, 48(%eax)
+; FALLBACK24-NEXT: movl %ebp, 52(%eax)
+; FALLBACK24-NEXT: movl %edi, 40(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 44(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 32(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 36(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 24(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 28(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 16(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 20(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 8(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 12(%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, (%eax)
+; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT: movl %ecx, 4(%eax)
+; FALLBACK24-NEXT: addl $204, %esp
+; FALLBACK24-NEXT: popl %esi
+; FALLBACK24-NEXT: popl %edi
+; FALLBACK24-NEXT: popl %ebx
+; FALLBACK24-NEXT: popl %ebp
+; FALLBACK24-NEXT: vzeroupper
+; FALLBACK24-NEXT: retl
+;
+; FALLBACK25-LABEL: ashr_64bytes:
+; FALLBACK25: # %bb.0:
+; FALLBACK25-NEXT: pushl %ebp
+; FALLBACK25-NEXT: pushl %ebx
+; FALLBACK25-NEXT: pushl %edi
+; FALLBACK25-NEXT: pushl %esi
+; FALLBACK25-NEXT: subl $188, %esp
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT: vmovups (%eax), %ymm0
+; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK25-NEXT: movl 48(%eax), %edx
+; FALLBACK25-NEXT: movl 52(%eax), %esi
+; FALLBACK25-NEXT: movl 56(%eax), %edi
+; FALLBACK25-NEXT: movl 60(%eax), %eax
+; FALLBACK25-NEXT: movl (%ecx), %ecx
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: sarl $31, %eax
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
+; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shll $3, %ecx
+; FALLBACK25-NEXT: andl $24, %ecx
+; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %esi
+; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edx
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %edx
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl %eax, %edi
+; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK25-NEXT: sarl %cl, %eax
+; FALLBACK25-NEXT: movl %eax, 60(%ebp)
+; FALLBACK25-NEXT: movl %esi, 48(%ebp)
+; FALLBACK25-NEXT: movl %edi, 52(%ebp)
+; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK25-NEXT: movl %eax, 4(%ebp)
+; FALLBACK25-NEXT: addl $188, %esp
+; FALLBACK25-NEXT: popl %esi
+; FALLBACK25-NEXT: popl %edi
+; FALLBACK25-NEXT: popl %ebx
+; FALLBACK25-NEXT: popl %ebp
+; FALLBACK25-NEXT: vzeroupper
+; FALLBACK25-NEXT: retl
+;
+; FALLBACK26-LABEL: ashr_64bytes:
+; FALLBACK26: # %bb.0:
+; FALLBACK26-NEXT: pushl %ebp
+; FALLBACK26-NEXT: pushl %ebx
+; FALLBACK26-NEXT: pushl %edi
+; FALLBACK26-NEXT: pushl %esi
+; FALLBACK26-NEXT: subl $204, %esp
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK26-NEXT: movl 48(%ecx), %edx
+; FALLBACK26-NEXT: movl 52(%ecx), %esi
+; FALLBACK26-NEXT: movl 56(%ecx), %edi
+; FALLBACK26-NEXT: movl 60(%ecx), %ecx
+; FALLBACK26-NEXT: movl (%eax), %eax
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: sarl $31, %ecx
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK26-NEXT: movl %eax, %ecx
+; FALLBACK26-NEXT: leal (,%eax,8), %edx
+; FALLBACK26-NEXT: andl $24, %edx
+; FALLBACK26-NEXT: andl $60, %ecx
+; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl %edx, %ebx
+; FALLBACK26-NEXT: notb %bl
+; FALLBACK26-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: orl %edi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK26-NEXT: leal (%esi,%esi), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT: orl %eax, %edi
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT: addl %edi, %edi
+; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT: orl %esi, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: leal (%eax,%eax), %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK26-NEXT: movl %ecx, %edi
+; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK26-NEXT: orl %ebp, %eax
+; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT: addl %esi, %esi
+; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK26-NEXT: orl %ecx, %esi
+; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK26-NEXT: orl %edi, %ecx
+; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT: addl %eax, %eax
+; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK26-NEXT: addl %ebp, %ebp
+; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK26-NEXT: orl %eax, %ebx
+; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK26-NEXT: movl %edx, 60(%eax)
+; FALLBACK26-NEXT: movl %ebx, 56(%eax)
+; FALLBACK26-NEXT: movl %edi, 48(%eax)
+; FALLBACK26-NEXT: movl %ecx, 52(%eax)
+; FALLBACK26-NEXT: movl %esi, 40(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 44(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 32(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 36(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 24(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 28(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 16(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 20(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 8(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 12(%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, (%eax)
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT: movl %ecx, 4(%eax)
+; FALLBACK26-NEXT: addl $204, %esp
+; FALLBACK26-NEXT: popl %esi
+; FALLBACK26-NEXT: popl %edi
+; FALLBACK26-NEXT: popl %ebx
+; FALLBACK26-NEXT: popl %ebp
+; FALLBACK26-NEXT: vzeroupper
+; FALLBACK26-NEXT: retl
+;
+; FALLBACK27-LABEL: ashr_64bytes:
+; FALLBACK27: # %bb.0:
+; FALLBACK27-NEXT: pushl %ebp
+; FALLBACK27-NEXT: pushl %ebx
+; FALLBACK27-NEXT: pushl %edi
+; FALLBACK27-NEXT: pushl %esi
+; FALLBACK27-NEXT: subl $188, %esp
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK27-NEXT: vmovups (%eax), %ymm0
+; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK27-NEXT: movl 48(%eax), %edx
+; FALLBACK27-NEXT: movl 52(%eax), %esi
+; FALLBACK27-NEXT: movl 56(%eax), %edi
+; FALLBACK27-NEXT: movl 60(%eax), %eax
+; FALLBACK27-NEXT: movl (%ecx), %ecx
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: sarl $31, %eax
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK27-NEXT: movl %ecx, %ebp
+; FALLBACK27-NEXT: andl $60, %ebp
+; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shll $3, %ecx
+; FALLBACK27-NEXT: andl $24, %ecx
+; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %esi
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl %edi, %edx
+; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK27-NEXT: movl %eax, 56(%ebp)
+; FALLBACK27-NEXT: movl %esi, 48(%ebp)
+; FALLBACK27-NEXT: movl %edx, 52(%ebp)
+; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 44(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 32(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 36(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 24(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 28(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 16(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 20(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 8(%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK27-NEXT: movl %eax, 12(%ebp)
+; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK27-NEXT: movl %edi, (%ebp)
+; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK27-NEXT: movl %eax, 60(%ebp)
+; FALLBACK27-NEXT: addl $188, %esp
+; FALLBACK27-NEXT: popl %esi
+; FALLBACK27-NEXT: popl %edi
+; FALLBACK27-NEXT: popl %ebx
+; FALLBACK27-NEXT: popl %ebp
+; FALLBACK27-NEXT: vzeroupper
+; FALLBACK27-NEXT: retl
+;
+; FALLBACK28-LABEL: ashr_64bytes:
+; FALLBACK28: # %bb.0:
+; FALLBACK28-NEXT: pushl %ebp
+; FALLBACK28-NEXT: pushl %ebx
+; FALLBACK28-NEXT: pushl %edi
+; FALLBACK28-NEXT: pushl %esi
+; FALLBACK28-NEXT: subl $204, %esp
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK28-NEXT: movl 48(%ecx), %edx
+; FALLBACK28-NEXT: movl 52(%ecx), %esi
+; FALLBACK28-NEXT: movl 56(%ecx), %edi
+; FALLBACK28-NEXT: movl 60(%ecx), %ecx
+; FALLBACK28-NEXT: movl (%eax), %eax
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: sarl $31, %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT: movl %eax, %esi
+; FALLBACK28-NEXT: andl $60, %esi
+; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT: shll $3, %eax
+; FALLBACK28-NEXT: andl $24, %eax
+; FALLBACK28-NEXT: movl %edx, %edi
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb %al, %ch
+; FALLBACK28-NEXT: notb %ch
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %edi, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
+; FALLBACK28-NEXT: movl %edx, %ebp
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %edx, %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %ebx, %edx
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movl %eax, %edx
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: addl %eax, %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT: leal (%edi,%edi), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %eax, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
+; FALLBACK28-NEXT: movl %ebx, %ebp
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
+; FALLBACK28-NEXT: leal (%edx,%edx), %eax
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %eax
+; FALLBACK28-NEXT: orl %ebp, %eax
+; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edi
+; FALLBACK28-NEXT: addl %ebx, %ebx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %edi, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
+; FALLBACK28-NEXT: movl %edi, %ebp
+; FALLBACK28-NEXT: movl %eax, %ecx
+; FALLBACK28-NEXT: shrl %cl, %ebp
+; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebx
+; FALLBACK28-NEXT: orl %ebp, %ebx
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %edx
+; FALLBACK28-NEXT: addl %edi, %edi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edi
+; FALLBACK28-NEXT: orl %edx, %edi
+; FALLBACK28-NEXT: movl %esi, %edx
+; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
+; FALLBACK28-NEXT: movl %esi, %ebx
+; FALLBACK28-NEXT: movb %al, %cl
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %ebp
+; FALLBACK28-NEXT: orl %ebx, %ebp
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT: shrl %cl, %ebx
+; FALLBACK28-NEXT: addl %esi, %esi
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %esi
+; FALLBACK28-NEXT: orl %ebx, %esi
+; FALLBACK28-NEXT: movb %dl, %cl
+; FALLBACK28-NEXT: shrl %cl, %eax
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
+; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT: movb %ch, %cl
+; FALLBACK28-NEXT: shll %cl, %edx
+; FALLBACK28-NEXT: orl %eax, %edx
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT: sarl %cl, %ebx
+; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT: movl %ebx, 60(%eax)
+; FALLBACK28-NEXT: movl %edx, 56(%eax)
+; FALLBACK28-NEXT: movl %esi, 48(%eax)
+; FALLBACK28-NEXT: movl %ebp, 52(%eax)
+; FALLBACK28-NEXT: movl %edi, 40(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 44(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 32(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 36(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 24(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 28(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 16(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 20(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 8(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 12(%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, (%eax)
+; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT: movl %ecx, 4(%eax)
+; FALLBACK28-NEXT: addl $204, %esp
+; FALLBACK28-NEXT: popl %esi
+; FALLBACK28-NEXT: popl %edi
+; FALLBACK28-NEXT: popl %ebx
+; FALLBACK28-NEXT: popl %ebp
+; FALLBACK28-NEXT: vzeroupper
+; FALLBACK28-NEXT: retl
+;
+; FALLBACK29-LABEL: ashr_64bytes:
+; FALLBACK29: # %bb.0:
+; FALLBACK29-NEXT: pushl %ebp
+; FALLBACK29-NEXT: pushl %ebx
+; FALLBACK29-NEXT: pushl %edi
+; FALLBACK29-NEXT: pushl %esi
+; FALLBACK29-NEXT: subl $188, %esp
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT: vmovups (%eax), %ymm0
+; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK29-NEXT: movl 48(%eax), %edx
+; FALLBACK29-NEXT: movl 52(%eax), %esi
+; FALLBACK29-NEXT: movl 56(%eax), %edi
+; FALLBACK29-NEXT: movl 60(%eax), %eax
+; FALLBACK29-NEXT: movl (%ecx), %ecx
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: sarl $31, %eax
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
+; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shll $3, %ecx
+; FALLBACK29-NEXT: andl $24, %ecx
+; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %esi
+; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edx
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %edx
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
+; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
+; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl %eax, %edi
+; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
+; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
+; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK29-NEXT: sarl %cl, %eax
+; FALLBACK29-NEXT: movl %eax, 60(%ebp)
+; FALLBACK29-NEXT: movl %esi, 48(%ebp)
+; FALLBACK29-NEXT: movl %edi, 52(%ebp)
+; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK29-NEXT: movl %eax, 4(%ebp)
+; FALLBACK29-NEXT: addl $188, %esp
+; FALLBACK29-NEXT: popl %esi
+; FALLBACK29-NEXT: popl %edi
+; FALLBACK29-NEXT: popl %ebx
+; FALLBACK29-NEXT: popl %ebp
+; FALLBACK29-NEXT: vzeroupper
+; FALLBACK29-NEXT: retl
+;
+; FALLBACK30-LABEL: ashr_64bytes:
+; FALLBACK30: # %bb.0:
+; FALLBACK30-NEXT: pushl %ebp
+; FALLBACK30-NEXT: pushl %ebx
+; FALLBACK30-NEXT: pushl %edi
+; FALLBACK30-NEXT: pushl %esi
+; FALLBACK30-NEXT: subl $204, %esp
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
+; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1
+; FALLBACK30-NEXT: movl 48(%ecx), %edx
+; FALLBACK30-NEXT: movl 52(%ecx), %esi
+; FALLBACK30-NEXT: movl 56(%ecx), %edi
+; FALLBACK30-NEXT: movl 60(%ecx), %ecx
+; FALLBACK30-NEXT: movl (%eax), %eax
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: sarl $31, %ecx
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK30-NEXT: movl %eax, %ecx
+; FALLBACK30-NEXT: leal (,%eax,8), %edx
+; FALLBACK30-NEXT: andl $24, %edx
+; FALLBACK30-NEXT: andl $60, %ecx
+; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl %edx, %ebx
+; FALLBACK30-NEXT: notb %bl
+; FALLBACK30-NEXT: leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: orl %edi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi
+; FALLBACK30-NEXT: leal (%esi,%esi), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT: orl %eax, %edi
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi
+; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
+; FALLBACK30-NEXT: addl %edi, %edi
+; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT: orl %esi, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: leal (%eax,%eax), %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi
+; FALLBACK30-NEXT: movl %ecx, %edi
+; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
+; FALLBACK30-NEXT: orl %ebp, %eax
+; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT: addl %esi, %esi
+; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
+; FALLBACK30-NEXT: orl %ecx, %esi
+; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx
+; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT: shrxl %edx, %eax, %edi
+; FALLBACK30-NEXT: orl %edi, %ecx
+; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT: addl %eax, %eax
+; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
+; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp
+; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx
+; FALLBACK30-NEXT: addl %ebp, %ebp
+; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx
+; FALLBACK30-NEXT: orl %eax, %ebx
+; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK30-NEXT: movl %edx, 60(%eax)
+; FALLBACK30-NEXT: movl %ebx, 56(%eax)
+; FALLBACK30-NEXT: movl %edi, 48(%eax)
+; FALLBACK30-NEXT: movl %ecx, 52(%eax)
+; FALLBACK30-NEXT: movl %esi, 40(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 44(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 32(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 36(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 24(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 28(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 16(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 20(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 8(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 12(%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, (%eax)
+; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT: movl %ecx, 4(%eax)
+; FALLBACK30-NEXT: addl $204, %esp
+; FALLBACK30-NEXT: popl %esi
+; FALLBACK30-NEXT: popl %edi
+; FALLBACK30-NEXT: popl %ebx
+; FALLBACK30-NEXT: popl %ebp
+; FALLBACK30-NEXT: vzeroupper
+; FALLBACK30-NEXT: retl
+;
+; FALLBACK31-LABEL: ashr_64bytes:
+; FALLBACK31: # %bb.0:
+; FALLBACK31-NEXT: pushl %ebp
+; FALLBACK31-NEXT: pushl %ebx
+; FALLBACK31-NEXT: pushl %edi
+; FALLBACK31-NEXT: pushl %esi
+; FALLBACK31-NEXT: subl $188, %esp
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FALLBACK31-NEXT: vmovups (%eax), %ymm0
+; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1
+; FALLBACK31-NEXT: movl 48(%eax), %edx
+; FALLBACK31-NEXT: movl 52(%eax), %esi
+; FALLBACK31-NEXT: movl 56(%eax), %edi
+; FALLBACK31-NEXT: movl 60(%eax), %eax
+; FALLBACK31-NEXT: movl (%ecx), %ecx
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: sarl $31, %eax
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK31-NEXT: movl %ecx, %ebp
+; FALLBACK31-NEXT: andl $60, %ebp
+; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
+; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shll $3, %ecx
+; FALLBACK31-NEXT: andl $24, %ecx
+; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
+; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %esi
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
+; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
+; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
+; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
+; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
+; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
+; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
+; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl %edi, %edx
+; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
+; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
+; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
+; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
+; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
+; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK31-NEXT: movl %eax, 56(%ebp)
+; FALLBACK31-NEXT: movl %esi, 48(%ebp)
+; FALLBACK31-NEXT: movl %edx, 52(%ebp)
+; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 44(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 32(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 36(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 24(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 28(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 16(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 20(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 8(%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK31-NEXT: movl %eax, 12(%ebp)
+; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
+; FALLBACK31-NEXT: movl %edi, (%ebp)
+; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
+; FALLBACK31-NEXT: movl %eax, 60(%ebp)
+; FALLBACK31-NEXT: addl $188, %esp
+; FALLBACK31-NEXT: popl %esi
+; FALLBACK31-NEXT: popl %edi
+; FALLBACK31-NEXT: popl %ebx
+; FALLBACK31-NEXT: popl %ebp
+; FALLBACK31-NEXT: vzeroupper
+; FALLBACK31-NEXT: retl
+ %src = load i512, ptr %src.ptr, align 1
+ %byteOff = load i512, ptr %byteOff.ptr, align 1
+ %bitOff = shl i512 %byteOff, 3
+ %res = ashr i512 %src, %bitOff
+ store i512 %res, ptr %dst, align 1
+ ret void
+}
+
+define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
+; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rbx
; X64-SSE2-NEXT: movq (%rdi), %rax
@@ -2394,15 +24296,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $63, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
+; X64-SSE2-NEXT: andl $7, %esi
+; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -2414,8 +24316,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE2-NEXT: popq %rbx
; X64-SSE2-NEXT: retq
;
-; X64-SSE42-LABEL: ashr_64bytes:
+; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rax
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
@@ -2424,9 +24327,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movl (%rsi), %esi
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2436,19 +24339,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: andl $63, %esi
-; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3
+; X64-SSE42-NEXT: andl $7, %esi
+; X64-SSE42-NEXT: movups -128(%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups -112(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT: movups -96(%rsp,%rsi,8), %xmm2
+; X64-SSE42-NEXT: movups -80(%rsp,%rsi,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_64bytes:
+; X64-AVX-LABEL: ashr_64bytes_qwordOff:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rax
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX-NEXT: movq 48(%rdi), %rax
@@ -2456,7 +24361,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movl (%rsi), %esi
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -2467,25 +24372,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $63, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3
+; X64-AVX-NEXT: andl $7, %esi
+; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: popq %rax
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
-; X86-SSE2-LABEL: ashr_64bytes:
+; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
-; X86-SSE2-NEXT: subl $168, %esp
+; X86-SSE2-NEXT: subl $188, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2506,7 +24412,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl 32(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 36(%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 40(%eax), %ebp
; X86-SSE2-NEXT: movl 44(%eax), %ebx
; X86-SSE2-NEXT: movl 48(%eax), %edi
@@ -2520,7 +24426,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2558,33 +24464,33 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT: andl $63, %eax
-; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
+; X86-SSE2-NEXT: andl $7, %eax
+; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx
+; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
-; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
-; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
-; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
-; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
-; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
-; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
+; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp
+; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx
+; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi
+; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi
+; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx
+; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 56(%eax)
; X86-SSE2-NEXT: movl %edx, 60(%eax)
@@ -2592,7 +24498,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
-; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
@@ -2612,14 +24518,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
-; X86-SSE2-NEXT: addl $168, %esp
+; X86-SSE2-NEXT: addl $188, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
-; X86-SSE42-LABEL: ashr_64bytes:
+; X86-SSE42-LABEL: ashr_64bytes_qwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
@@ -2640,9 +24546,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: movups %xmm0, (%esp)
+; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SSE42-NEXT: movaps %xmm0, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2660,11 +24566,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-SSE42-NEXT: andl $63, %ecx
-; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
-; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
-; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
-; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
+; X86-SSE42-NEXT: andl $7, %ecx
+; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0
+; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1
+; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2
+; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3
; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
@@ -2675,7 +24581,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_64bytes:
+; X86-AVX-LABEL: ashr_64bytes_qwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
@@ -2695,7 +24601,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %ymm0, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2714,11 +24620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $63, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3
+; X86-AVX-NEXT: andl $7, %ecx
+; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
@@ -2730,45 +24636,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
- %byteOff = load i512, ptr %byteOff.ptr, align 1
- %bitOff = shl i512 %byteOff, 3
+ %qwordOff = load i512, ptr %qwordOff.ptr, align 1
+ %bitOff = shl i512 %qwordOff, 6
%res = ashr i512 %src, %bitOff
store i512 %res, ptr %dst, align 1
ret void
}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; FALLBACK0: {{.*}}
-; FALLBACK1: {{.*}}
-; FALLBACK10: {{.*}}
-; FALLBACK11: {{.*}}
-; FALLBACK12: {{.*}}
-; FALLBACK13: {{.*}}
-; FALLBACK14: {{.*}}
-; FALLBACK15: {{.*}}
-; FALLBACK16: {{.*}}
-; FALLBACK17: {{.*}}
-; FALLBACK18: {{.*}}
-; FALLBACK19: {{.*}}
-; FALLBACK2: {{.*}}
-; FALLBACK20: {{.*}}
-; FALLBACK21: {{.*}}
-; FALLBACK22: {{.*}}
-; FALLBACK23: {{.*}}
-; FALLBACK24: {{.*}}
-; FALLBACK25: {{.*}}
-; FALLBACK26: {{.*}}
-; FALLBACK27: {{.*}}
-; FALLBACK28: {{.*}}
-; FALLBACK29: {{.*}}
-; FALLBACK3: {{.*}}
-; FALLBACK30: {{.*}}
-; FALLBACK31: {{.*}}
-; FALLBACK4: {{.*}}
-; FALLBACK5: {{.*}}
-; FALLBACK6: {{.*}}
-; FALLBACK7: {{.*}}
-; FALLBACK8: {{.*}}
-; FALLBACK9: {{.*}}
; X64: {{.*}}
; X86: {{.*}}
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index f84131dfc879..8c0873492ce4 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -588,61 +588,58 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -655,50 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -711,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -768,47 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -899,66 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -967,58 +940,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes:
@@ -1027,34 +987,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
@@ -1072,7 +1030,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1081,57 +1039,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%bitOff = load i128, ptr %bitOff.ptr, align 1
@@ -1218,62 +1164,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1286,51 +1231,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1343,52 +1279,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1401,48 +1337,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -1459,35 +1390,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
@@ -1496,142 +1426,124 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1640,127 +1552,120 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1775,95 +1680,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
@@ -1879,103 +1756,95 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -1988,92 +1857,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2089,31 +1939,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -2146,79 +1996,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r8, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
@@ -2226,50 +2067,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2278,118 +2109,112 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2398,7 +2223,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2413,99 +2238,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
@@ -2519,106 +2315,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -2631,95 +2426,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -2735,36 +2510,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
@@ -2773,145 +2548,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -2920,17 +2680,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
@@ -2942,7 +2702,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
@@ -2953,95 +2713,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3088,64 +2847,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
@@ -3161,106 +2897,101 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -3273,93 +3004,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -3381,6 +3098,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3390,6 +3108,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3398,18 +3121,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -3417,7 +3132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -3426,7 +3140,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -3478,6 +3192,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3488,22 +3203,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -3511,73 +3228,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3588,6 +3273,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3597,6 +3283,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3606,52 +3297,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -3662,10 +3344,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3676,11 +3359,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -3691,6 +3371,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
@@ -3700,60 +3385,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -3762,40 +3426,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3806,8 +3474,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3816,214 +3483,199 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -4036,209 +3688,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4252,42 +3848,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -4297,6 +3897,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4307,163 +3908,141 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
@@ -4478,7 +4057,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4489,7 +4068,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
@@ -4499,7 +4078,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
@@ -4508,13 +4087,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4522,9 +4105,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -4534,138 +4118,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -4680,7 +4216,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
@@ -4695,6 +4230,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -4703,107 +4243,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r14), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
@@ -4815,7 +4339,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -4823,77 +4352,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -4904,6 +4398,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4913,6 +4408,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -4922,68 +4422,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -4994,12 +4484,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
@@ -5009,6 +4496,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -5018,65 +4510,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5085,42 +4552,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5129,6 +4598,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5137,200 +4609,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5353,7 +4804,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5366,213 +4817,153 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%ebp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -5585,50 +4976,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5641,179 +5037,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $216, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -5827,42 +5194,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5870,6 +5239,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -5882,148 +5254,93 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
@@ -6045,6 +5362,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -6072,9 +5390,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
@@ -6082,7 +5400,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
@@ -6091,7 +5408,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -6143,6 +5460,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -6153,22 +5471,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
@@ -6176,74 +5491,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, (%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6254,6 +5545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -6281,44 +5573,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
@@ -6329,10 +5620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
@@ -6343,11 +5635,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
@@ -6376,52 +5665,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
@@ -6430,12 +5706,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
@@ -6443,7 +5719,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6452,19 +5728,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6473,7 +5749,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6482,7 +5758,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
@@ -6503,196 +5779,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
@@ -6705,7 +5980,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -6718,7 +5993,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx
@@ -6726,189 +6001,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx
@@ -6942,199 +6172,199 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
@@ -7149,7 +6379,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -7158,7 +6388,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx
@@ -7170,173 +6400,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, (%esp) # 4-byte Folded Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 9ae1f270e883..044be12a3954 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -432,30 +432,89 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -505,30 +564,89 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -577,30 +695,89 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -649,32 +826,128 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: movss %xmm0, (%esp)
-; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $32, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $32, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -689,58 +962,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -756,58 +1094,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -822,58 +1238,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: xorps %xmm1, %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -888,60 +1382,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -956,70 +1581,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
@@ -1034,84 +1877,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
}
define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movzbl -128(%rsp,%rsi), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movb %cl, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1127,84 +2041,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movw %si, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1219,84 +2204,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movl -128(%rsp,%rsi), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: andl $56, %ecx
+; X64-NO-BMI2-NEXT: andl $56, %esi
+; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: addl %esi, %esi
+; X64-NO-BMI2-NEXT: notl %ecx
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NEXT: orl %eax, %esi
+; X64-NO-BMI2-NEXT: movl %esi, (%rdx)
+; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: leal (,%rsi,8), %eax
+; X64-BMI2-NEXT: andl $56, %eax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: notl %eax
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-BMI2-NEXT: addl %esi, %esi
+; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $136, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: leal (,%edx,8), %ecx
+; X86-SHLD-NEXT: andl $60, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $136, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1311,86 +2367,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: pushq %rax
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: andl $56, %esi
+; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
+; X64-SHLD-NEXT: movq %rax, (%rdx)
+; X64-SHLD-NEXT: popq %rax
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $128, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %esi
+; X86-SHLD-NEXT: andl $60, %esi
+; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi
+; X86-SHLD-NEXT: movl (%esp,%esi), %edx
+; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %esi, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edx, (%eax)
+; X86-SHLD-NEXT: addl $128, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1405,96 +2591,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
}
define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $128, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %edi
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1509,116 +2925,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl $63, %esi
-; X64-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NEXT: movq -120(%rsp,%rsi), %rcx
-; X64-NEXT: movq -112(%rsp,%rsi), %rdi
-; X64-NEXT: movq -104(%rsp,%rsi), %rsi
-; X64-NEXT: movq %rsi, 24(%rdx)
-; X64-NEXT: movq %rdi, 16(%rdx)
-; X64-NEXT: movq %rcx, 8(%rdx)
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $136, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movdqu (%ecx), %xmm0
-; X86-NEXT: movdqu 16(%ecx), %xmm1
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $63, %eax
-; X86-NEXT: movl 8(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esp,%eax), %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 16(%esp,%eax), %esi
-; X86-NEXT: movl 20(%esp,%eax), %edi
-; X86-NEXT: movl 24(%esp,%eax), %ebx
-; X86-NEXT: movl 28(%esp,%eax), %ebp
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl 36(%esp,%eax), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl %edx, 24(%eax)
-; X86-NEXT: movl %ebp, 20(%eax)
-; X86-NEXT: movl %ebx, 16(%eax)
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $136, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %r8b
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %r11, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $156, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %edi
+; X86-SHLD-NEXT: andl $60, %edi
+; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx
+; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: andl $24, %ecx
+; X86-SHLD-NEXT: movl %eax, %esi
+; X86-SHLD-NEXT: movl %edx, %eax
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 28(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %eax
+; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: movl 32(%esp,%edi), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %edx
+; X86-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT: movl 36(%esp,%edi), %esi
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebp
+; X86-SHLD-NEXT: movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT: shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT: movl 16(%esp,%edi), %ebx
+; X86-SHLD-NEXT: movl 48(%esp,%edi), %edi
+; X86-SHLD-NEXT: shrdl %cl, %edi, %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-SHLD-NEXT: movl %eax, 28(%edi)
+; X86-SHLD-NEXT: movl %edx, 24(%edi)
+; X86-SHLD-NEXT: movl %esi, 20(%edi)
+; X86-SHLD-NEXT: movl %ebp, 16(%edi)
+; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 12(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 8(%edi)
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: movl %eax, 4(%edi)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %eax, %ebx
+; X86-SHLD-NEXT: movl %ebx, (%edi)
+; X86-SHLD-NEXT: addl $156, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
@@ -1633,9 +3417,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X64-HAVE-BMI2-HAVE-SHLD: {{.*}}
-; X64-NO-BMI2-HAVE-SHLD: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
+; X86: {{.*}}
; X86-HAVE-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-BMI2-HAVE-SHLD: {{.*}}
; X86-NO-SHLD: {{.*}}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 4a47e7613dfa..ff13f4ba577f 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -603,32 +603,86 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_1byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -711,32 +765,86 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_2byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -818,32 +926,86 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_4byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $40, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $40, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -925,34 +1087,125 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
-; X86-LABEL: load_8byte_chunk_of_16byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: andb $15, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: retl
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $32, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $3, %dl
+; X86-SHLD-NEXT: andb $12, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $32, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <16 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <16 x i8> %init
@@ -967,64 +1220,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; no @load_16byte_chunk_of_16byte_alloca
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzbl -64(%rsp,%rax), %eax
-; X64-NEXT: movb %al, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_1byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movzbl (%esp,%ecx), %ecx
-; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shrq %cl, %rax
+; X64-NO-BMI2-NEXT: movb %al, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax
+; X64-BMI2-NEXT: movb %al, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT: movb %bl, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1038,64 +1355,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movw %ax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_2byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movw %cx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movw %si, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1108,64 +1502,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movl -64(%rsp,%rax), %eax
-; X64-NEXT: movl %eax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_4byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NEXT: shrb $6, %al
+; X64-NO-BMI2-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: addl %eax, %eax
+; X64-NO-BMI2-NEXT: andb $56, %cl
+; X64-NO-BMI2-NEXT: notb %cl
+; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NEXT: orl %esi, %eax
+; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: retq
+;
+; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: movups (%rdi), %xmm0
+; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
+; X64-BMI2-NEXT: shll $3, %esi
+; X64-BMI2-NEXT: xorps %xmm2, %xmm2
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movl %esi, %eax
+; X64-BMI2-NEXT: shrb $6, %al
+; X64-BMI2-NEXT: movzbl %al, %eax
+; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT: andb $56, %sil
+; X64-BMI2-NEXT: notb %sil
+; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: addl %eax, %eax
+; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT: orl %eax, %ecx
+; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $72, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-SHLD-NEXT: movl %esi, (%eax)
+; X86-SHLD-NEXT: addl $72, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1178,66 +1649,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_8byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movl %ecx, %eax
+; X64-SHLD-NEXT: shrb $6, %al
+; X64-SHLD-NEXT: movzbl %al, %eax
+; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
+; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %al
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $64, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: movups (%edx), %xmm0
+; X86-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-SHLD-NEXT: movl %ecx, %edx
+; X86-SHLD-NEXT: shrb $5, %dl
+; X86-SHLD-NEXT: movzbl %dl, %edx
+; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi
+; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi
+; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx
+; X86-SHLD-NEXT: movl %edx, %ebx
+; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: shrdl %cl, %edx, %edi
+; X86-SHLD-NEXT: movl %ebx, 4(%eax)
+; X86-SHLD-NEXT: movl %edi, (%eax)
+; X86-SHLD-NEXT: addl $64, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1250,76 +1852,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
}
define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
-; X64-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X64: # %bb.0:
-; X64-NEXT: movdqu (%rdi), %xmm0
-; X64-NEXT: movdqu 16(%rdi), %xmm1
-; X64-NEXT: shll $3, %esi
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrb $3, %sil
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-NEXT: movq -56(%rsp,%rax), %rax
-; X64-NEXT: movq %rax, 8(%rdx)
-; X64-NEXT: movq %rcx, (%rdx)
-; X64-NEXT: retq
-;
-; X86-LABEL: load_16byte_chunk_of_32byte_alloca:
-; X86: # %bb.0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movdqu (%edx), %xmm0
-; X86-NEXT: movdqu 16(%edx), %xmm1
-; X86-NEXT: shll $3, %ecx
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrb $3, %cl
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl (%esp,%ecx), %edx
-; X86-NEXT: movl 4(%esp,%ecx), %esi
-; X86-NEXT: movl 8(%esp,%ecx), %edi
-; X86-NEXT: movl 12(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edx, (%eax)
-; X86-NEXT: addl $64, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: retl
+; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
+;
+; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
+;
+; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-NO-BMI2-NO-SHLD: # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT: retl
+;
+; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-SHLD: # %bb.0:
+; X86-SHLD-NEXT: pushl %ebp
+; X86-SHLD-NEXT: pushl %ebx
+; X86-SHLD-NEXT: pushl %edi
+; X86-SHLD-NEXT: pushl %esi
+; X86-SHLD-NEXT: subl $92, %esp
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT: movups (%eax), %xmm0
+; X86-SHLD-NEXT: movups 16(%eax), %xmm1
+; X86-SHLD-NEXT: shll $3, %ecx
+; X86-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-SHLD-NEXT: movl %ecx, %eax
+; X86-SHLD-NEXT: shrb $5, %al
+; X86-SHLD-NEXT: movzbl %al, %ebx
+; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi
+; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax
+; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi
+; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT: shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp
+; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx
+; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp
+; X86-SHLD-NEXT: movl %ebp, 12(%edx)
+; X86-SHLD-NEXT: movl %esi, 8(%edx)
+; X86-SHLD-NEXT: movl %edi, 4(%edx)
+; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SHLD-NEXT: shrdl %cl, %esi, %eax
+; X86-SHLD-NEXT: movl %eax, (%edx)
+; X86-SHLD-NEXT: addl $92, %esp
+; X86-SHLD-NEXT: popl %esi
+; X86-SHLD-NEXT: popl %edi
+; X86-SHLD-NEXT: popl %ebx
+; X86-SHLD-NEXT: popl %ebp
+; X86-SHLD-NEXT: retl
+;
+; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
+; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
%intermediate.val.frozen = freeze <32 x i8> %init
@@ -1334,7 +2155,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; no @load_32byte_chunk_of_32byte_alloca
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
+; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
-; X64-SHLD: {{.*}}
+; X86: {{.*}}
; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}