diff options
| author | Vitaly Buka <vitalybuka@google.com> | 2024-09-23 15:55:29 -0700 |
|---|---|---|
| committer | Vitaly Buka <vitalybuka@google.com> | 2024-09-23 15:55:29 -0700 |
| commit | 80323f174971174928c87fb0e958a6fcfe094d59 (patch) | |
| tree | bb0862b94fc42ba636ea993820a3368b851fd334 /llvm/test/CodeGen | |
| parent | 1c4f36eefcbee84fe801c6817ff4cdc7feeafd13 (diff) | |
| parent | 8dbb739ffb0880e4f739992d07dc6ba6edca9509 (diff) | |
[𝘀𝗽𝗿] changes introduced through rebaseusers/vitalybuka/spr/main.nfchwasan-remove-code-duplication-in-shadowmappinginit
Created using spr 1.3.4
[skip ci]
Diffstat (limited to 'llvm/test/CodeGen')
72 files changed, 46808 insertions, 13820 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir index 3e768c4d7a26..03c28efe7e09 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-freeze.mir @@ -159,25 +159,13 @@ body: | ; CHECK-LABEL: name: test_freeze_v3s8 ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UV]](s16) - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[UV1]](s16) - ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s8>) = G_BUILD_VECTOR [[TRUNC]](s8), [[TRUNC1]](s8), [[TRUNC2]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(<8 x s16>) = G_ANYEXT [[BUILD_VECTOR]](<8 x s8>) - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<4 x s16>), [[UV5:%[0-9]+]]:_(<4 x s16>) = G_UNMERGE_VALUES [[ANYEXT]](<8 x s16>) - ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s16>) = G_FREEZE [[UV4]] - ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[FREEZE]](<4 x s16>) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<4 x s8>) = G_FREEZE [[DEF]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[FREEZE]](<4 x s8>) ; CHECK-NEXT: %undef:_(s32) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CHECK-NEXT: %ext0:_(s32) = G_AND [[ANYEXT1]], [[C]] - ; CHECK-NEXT: 
[[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16) - ; CHECK-NEXT: %ext1:_(s32) = G_AND [[ANYEXT2]], [[C]] - ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16) - ; CHECK-NEXT: %ext2:_(s32) = G_AND [[ANYEXT3]], [[C]] + ; CHECK-NEXT: %ext0:_(s32) = G_ZEXT [[UV]](s8) + ; CHECK-NEXT: %ext1:_(s32) = G_ZEXT [[UV1]](s8) + ; CHECK-NEXT: %ext2:_(s32) = G_ZEXT [[UV2]](s8) ; CHECK-NEXT: %res:_(<4 x s32>) = G_BUILD_VECTOR %ext0(s32), %ext1(s32), %ext2(s32), %undef(s32) ; CHECK-NEXT: $q0 = COPY %res(<4 x s32>) %x:_(<3 x s8>) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir index 9a8697c1d9b8..11c6c7fb40fa 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-insert-vector-elt.mir @@ -248,13 +248,10 @@ body: | ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[UV2]](s16) ; CHECK-NEXT: [[TRUNC4:%[0-9]+]]:_(s8) = G_TRUNC [[UV3]](s16) ; CHECK-NEXT: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[UV4]](s16) - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) - ; CHECK-NEXT: [[TRUNC6:%[0-9]+]]:_(s8) = G_TRUNC [[UV6]](s16) - ; CHECK-NEXT: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[UV7]](s16) - ; CHECK-NEXT: [[TRUNC8:%[0-9]+]]:_(s8) = G_TRUNC [[UV8]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), 
[[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[TRUNC6]](s8), [[TRUNC7]](s8), [[TRUNC8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[DEF2]](<4 x s8>) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[TRUNC3]](s8), [[TRUNC4]](s8), [[TRUNC5]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[DEF]](s8), [[DEF]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[UV6]](s8), [[UV7]](s8), [[UV8]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8), [[DEF1]](s8) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s8>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR1]](<16 x s8>), [[BUILD_VECTOR2]], shufflemask(0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, undef, undef, undef, undef) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[SHUF]](<16 x s8>) ; CHECK-NEXT: [[UITOFP:%[0-9]+]]:_(<4 x s32>) = G_UITOFP [[BITCAST]](<4 x s32>) diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index e90014be21de..b14f1a43b7dc 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -177,9 +177,7 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){ ; ; CHECK-GI-LABEL: bswap_v2i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: rev16 v0.8b, v0.8b ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll 
index 18570b2d793f..eee917e8acb0 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -183,15 +183,12 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v8s16_v2s16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: ldr h2, [x0, #2] -; CHECK-GI-NEXT: dup v0.4s, w8 -; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] -; CHECK-GI-NEXT: xtn v2.4h, v0.4s -; CHECK-GI-NEXT: xtn v1.4h, v1.4s -; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: mov v0.s[0], w8 -; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: mov v0.s[3], w8 @@ -209,10 +206,7 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { ; ; CHECK-GI-LABEL: concat_v16s8_v4s8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.8h, w8 -; CHECK-GI-NEXT: xtn v1.8b, v0.8h ; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: mov v0.s[1], w8 ; CHECK-GI-NEXT: mov v0.s[2], w8 ; CHECK-GI-NEXT: mov v0.s[3], w8 diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index aa20304e52a9..a9618fdc2dec 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -3,24 +3,10 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) { -; CHECK-SD-LABEL: interleave2_v4f16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: interleave2_v4f16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v2.4s, w8 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: xtn v0.4h, 
v2.4s -; CHECK-GI-NEXT: mov v1.s[0], w8 -; CHECK-GI-NEXT: mov v2.s[0], w9 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: zip1 v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: interleave2_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1) ret <4 x half> %retval } diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index 20b5567e973d..f72a49f6ab7c 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -3172,42 +3172,22 @@ entry: } define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) { -; CHECK-SD-LABEL: fptos_v3f32_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptos_v3f32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptos_v3f32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptosi <3 x float> %a to <3 x i16> ret <3 x i16> %c } define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) { -; CHECK-SD-LABEL: fptou_v3f32_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptou_v3f32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 
-; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptou_v3f32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptoui <3 x float> %a to <3 x i16> ret <3 x i16> %c @@ -6077,11 +6057,7 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i16: @@ -6110,11 +6086,7 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov w9, v0.s[2] -; CHECK-GI-NOFP16-NEXT: mov v0.h[1], w8 -; CHECK-GI-NOFP16-NEXT: mov v0.h[2], w9 -; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NOFP16-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i16: diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 4ac04798e154..f70ec0f35cb5 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -7450,9 +7450,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ 
-7493,9 +7491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16: ; CHECK-GI-FP16: // %bb.0: // %entry -; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] @@ -8059,8 +8055,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-FP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-FP16-NEXT: mov w8, v0.s[1] -; CHECK-GI-FP16-NEXT: mov v0.h[1], w8 +; CHECK-GI-FP16-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: ucvtf v0.4h, v0.4h ; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] ; CHECK-GI-FP16-NEXT: mov v0.h[1], v1.h[0] diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 7014a4a9acbe..54f7887aee8d 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -531,26 +531,8 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: shl_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov h3, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: mov v1.b[1], w9 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: fmov w8, s5 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov v0.b[3], w8 -; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b +; 
CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] ; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] @@ -592,12 +574,8 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: shl_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] ; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] @@ -741,26 +719,8 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: ashr_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[3] -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v1.b[3], w8 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: sshl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] @@ -802,12 +762,8 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: ashr_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v1.s[1] -; CHECK-GI-NEXT: // kill: def 
$d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: sshl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] @@ -946,26 +902,8 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; ; CHECK-GI-LABEL: lshr_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v1.h[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h3, v0.h[1] -; CHECK-GI-NEXT: mov h4, v1.h[2] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[3] -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mov h4, v0.h[3] -; CHECK-GI-NEXT: mov v1.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov h3, v0.h[2] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s3 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v1.b[3], w8 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: neg v1.8b, v1.8b ; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: mov b1, v0.b[1] @@ -1006,12 +944,8 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){ ; ; CHECK-GI-LABEL: lshr_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v1.s[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w9, v0.s[1] -; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: neg v1.4h, v1.4h ; CHECK-GI-NEXT: ushl v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: mov h1, v0.h[1] diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll 
b/llvm/test/CodeGen/AArch64/shufflevector.ll index 954458e44597..5f4ff1e64673 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -209,27 +209,9 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov h2, v0.h[1] -; CHECK-GI-NEXT: mov h3, v1.h[1] -; CHECK-GI-NEXT: mov h4, v0.h[2] -; CHECK-GI-NEXT: mov h5, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov h2, v1.h[2] -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov h3, v1.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: mov v1.b[1], w9 -; CHECK-GI-NEXT: fmov w8, s4 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: mov v1.b[2], w9 -; CHECK-GI-NEXT: fmov w8, s5 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mov v0.b[3], w8 -; CHECK-GI-NEXT: mov v1.b[3], w9 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: adrp x8, .LCPI15_0 +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -284,13 +266,9 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: adrp x8, .LCPI17_0 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0] ; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b @@ -403,16 +381,7 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> 
%b){ ; ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov h1, v0.h[3] -; CHECK-GI-NEXT: mov v0.b[1], w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: mov v0.b[2], w8 -; CHECK-GI-NEXT: fmov w8, s1 -; CHECK-GI-NEXT: mov v0.b[3], w8 +; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: dup v0.8b, v0.b[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret @@ -448,9 +417,7 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){ ; ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: uzp1 v0.4h, v0.4h, v0.4h ; CHECK-GI-NEXT: dup v0.4h, v0.h[0] ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll index e21015ad3db3..b02788ab1b34 100644 --- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -186,10 +186,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldr q1, [x0] ; ALL-NEXT: stp x9, x8, [sp, #16] ; ALL-NEXT: mov x8, sp -; ALL-NEXT: and x9, x10, #0x1f +; ALL-NEXT: and x9, x10, #0x18 ; ALL-NEXT: str q1, [sp] ; ALL-NEXT: add x8, x8, x9 +; ALL-NEXT: lsl x9, x10, #3 ; ALL-NEXT: stp q0, q0, [sp, #32] +; ALL-NEXT: ldp x11, x10, [x8, #16] +; ALL-NEXT: mvn w13, w9 +; ALL-NEXT: ldp x8, x12, [x8] +; ALL-NEXT: and x9, x9, #0x38 +; ALL-NEXT: lsl x14, x10, #1 +; ALL-NEXT: lsl x15, x11, #1 +; ALL-NEXT: lsr x11, x11, x9 +; ALL-NEXT: lsl x16, x12, #1 +; ALL-NEXT: lsr x10, x10, x9 +; ALL-NEXT: lsr x12, x12, 
x9 +; ALL-NEXT: lsl x14, x14, x13 +; ALL-NEXT: lsr x8, x8, x9 +; ALL-NEXT: lsl x9, x16, x13 +; ALL-NEXT: lsl x13, x15, x13 +; ALL-NEXT: orr x11, x14, x11 +; ALL-NEXT: orr x8, x9, x8 +; ALL-NEXT: orr x9, x12, x13 +; ALL-NEXT: stp x11, x10, [x2, #16] +; ALL-NEXT: stp x8, x9, [x2] +; ALL-NEXT: add sp, sp, #64 +; ALL-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; ALL-LABEL: lshr_32bytes_dwordOff: +; ALL: // %bb.0: +; ALL-NEXT: sub sp, sp, #64 +; ALL-NEXT: ldp x9, x8, [x0, #16] +; ALL-NEXT: movi v0.2d, #0000000000000000 +; ALL-NEXT: ldr x10, [x1] +; ALL-NEXT: ldr q1, [x0] +; ALL-NEXT: stp x9, x8, [sp, #16] +; ALL-NEXT: ubfiz x8, x10, #3, #2 +; ALL-NEXT: mov x9, sp +; ALL-NEXT: str q1, [sp] +; ALL-NEXT: stp q0, q0, [sp, #32] +; ALL-NEXT: add x8, x9, x8 ; ALL-NEXT: ldp x10, x9, [x8, #16] ; ALL-NEXT: ldr q0, [x8] ; ALL-NEXT: str q0, [x2] @@ -197,12 +241,13 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = lshr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-LABEL: shl_32bytes: ; ALL: // %bb.0: @@ -213,11 +258,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldr q1, [x0] ; ALL-NEXT: stp x9, x8, [sp, #48] ; ALL-NEXT: mov x8, sp -; ALL-NEXT: and x9, x10, #0x1f +; ALL-NEXT: and x9, x10, #0x18 ; ALL-NEXT: add x8, x8, #32 ; ALL-NEXT: stp q0, q0, [sp] ; ALL-NEXT: str q1, [sp, #32] ; 
ALL-NEXT: sub x8, x8, x9 +; ALL-NEXT: lsl x9, x10, #3 +; ALL-NEXT: ldp x10, x11, [x8] +; ALL-NEXT: ldp x12, x8, [x8, #16] +; ALL-NEXT: mvn w13, w9 +; ALL-NEXT: and x9, x9, #0x38 +; ALL-NEXT: lsr x14, x10, #1 +; ALL-NEXT: lsr x15, x11, #1 +; ALL-NEXT: lsl x11, x11, x9 +; ALL-NEXT: lsr x16, x12, #1 +; ALL-NEXT: lsl x10, x10, x9 +; ALL-NEXT: lsl x12, x12, x9 +; ALL-NEXT: lsr x14, x14, x13 +; ALL-NEXT: lsl x8, x8, x9 +; ALL-NEXT: lsr x9, x16, x13 +; ALL-NEXT: lsr x13, x15, x13 +; ALL-NEXT: orr x11, x11, x14 +; ALL-NEXT: orr x8, x8, x9 +; ALL-NEXT: orr x9, x12, x13 +; ALL-NEXT: stp x10, x11, [x2] +; ALL-NEXT: stp x9, x8, [x2, #16] +; ALL-NEXT: add sp, sp, #64 +; ALL-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; ALL-LABEL: shl_32bytes_dwordOff: +; ALL: // %bb.0: +; ALL-NEXT: sub sp, sp, #64 +; ALL-NEXT: ldp x9, x8, [x0, #16] +; ALL-NEXT: movi v0.2d, #0000000000000000 +; ALL-NEXT: ldr x10, [x1] +; ALL-NEXT: ldr q1, [x0] +; ALL-NEXT: stp x9, x8, [sp, #48] +; ALL-NEXT: mov x8, sp +; ALL-NEXT: ubfiz x9, x10, #3, #2 +; ALL-NEXT: add x8, x8, #32 +; ALL-NEXT: stp q0, q1, [sp, #16] +; ALL-NEXT: str q0, [sp] +; ALL-NEXT: sub x8, x8, x9 ; ALL-NEXT: ldp x9, x10, [x8, #16] ; ALL-NEXT: ldr q0, [x8] ; ALL-NEXT: str q0, [x2] @@ -225,12 +315,13 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = shl i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { 
; ALL-LABEL: ashr_32bytes: ; ALL: // %bb.0: @@ -238,14 +329,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldp x9, x8, [x0, #16] ; ALL-NEXT: ldr x10, [x1] ; ALL-NEXT: ldr q0, [x0] -; ALL-NEXT: and x10, x10, #0x1f +; ALL-NEXT: and x11, x10, #0x18 ; ALL-NEXT: stp x9, x8, [sp, #16] ; ALL-NEXT: asr x8, x8, #63 ; ALL-NEXT: mov x9, sp ; ALL-NEXT: str q0, [sp] +; ALL-NEXT: add x9, x9, x11 +; ALL-NEXT: stp x8, x8, [sp, #48] +; ALL-NEXT: stp x8, x8, [sp, #32] +; ALL-NEXT: lsl x8, x10, #3 +; ALL-NEXT: ldp x11, x10, [x9, #16] +; ALL-NEXT: ldp x9, x12, [x9] +; ALL-NEXT: mvn w13, w8 +; ALL-NEXT: and x8, x8, #0x38 +; ALL-NEXT: lsl x14, x10, #1 +; ALL-NEXT: lsl x15, x11, #1 +; ALL-NEXT: lsr x11, x11, x8 +; ALL-NEXT: lsl x16, x12, #1 +; ALL-NEXT: asr x10, x10, x8 +; ALL-NEXT: lsr x12, x12, x8 +; ALL-NEXT: lsl x14, x14, x13 +; ALL-NEXT: lsr x8, x9, x8 +; ALL-NEXT: lsl x9, x16, x13 +; ALL-NEXT: lsl x13, x15, x13 +; ALL-NEXT: orr x11, x14, x11 +; ALL-NEXT: orr x8, x9, x8 +; ALL-NEXT: orr x9, x12, x13 +; ALL-NEXT: stp x11, x10, [x2, #16] +; ALL-NEXT: stp x8, x9, [x2] +; ALL-NEXT: add sp, sp, #64 +; ALL-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; ALL-LABEL: ashr_32bytes_dwordOff: +; ALL: // %bb.0: +; ALL-NEXT: sub sp, sp, #64 +; ALL-NEXT: ldp x9, x8, [x0, #16] +; ALL-NEXT: ldr x10, [x1] +; ALL-NEXT: ldr q0, [x0] +; ALL-NEXT: stp x9, x8, [sp, #16] +; ALL-NEXT: asr x8, x8, #63 +; ALL-NEXT: ubfiz x9, x10, #3, #2 +; ALL-NEXT: mov x10, sp +; ALL-NEXT: str q0, [sp] ; ALL-NEXT: stp x8, x8, [sp, #48] ; ALL-NEXT: stp x8, x8, [sp, #32] -; ALL-NEXT: add x8, x9, x10 +; ALL-NEXT: add x8, x10, x9 ; ALL-NEXT: ldp x10, x9, [x8, #16] ; ALL-NEXT: ldr q0, [x8] ; ALL-NEXT: str q0, [x2] @@ 
-253,8 +389,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll index a4da6db57eca..531e0fa740da 100644 --- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll @@ -160,30 +160,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldr x10, [x1] ; ALL-NEXT: ldr q1, [x0] ; ALL-NEXT: stp x9, x8, [sp, #16] -; ALL-NEXT: ubfx x8, x10, #3, #5 +; ALL-NEXT: lsr x8, x10, #3 ; ALL-NEXT: mov x9, sp ; ALL-NEXT: str q1, [sp] -; ALL-NEXT: and x10, x10, #0x7 +; ALL-NEXT: and x12, x10, #0x3f +; ALL-NEXT: and x8, x8, #0x18 ; ALL-NEXT: stp q0, q0, [sp, #32] +; ALL-NEXT: eor x12, x12, #0x3f ; ALL-NEXT: add x8, x9, x8 -; ALL-NEXT: mvn w13, w10 -; ALL-NEXT: ldp x11, x9, [x8, #16] -; ALL-NEXT: ldp x8, x12, [x8] +; ALL-NEXT: ldp x13, x11, [x8] +; ALL-NEXT: ldr x9, [x8, #24] +; ALL-NEXT: ldr x8, [x8, #16] ; ALL-NEXT: lsl x14, x9, #1 +; ALL-NEXT: lsr x9, x9, x10 ; ALL-NEXT: lsl x15, x11, #1 ; ALL-NEXT: lsr x11, x11, x10 -; ALL-NEXT: lsl x16, x12, #1 -; ALL-NEXT: lsr x9, x9, x10 -; ALL-NEXT: lsr x12, x12, x10 -; ALL-NEXT: lsl x14, x14, x13 +; ALL-NEXT: lsr x13, x13, x10 +; ALL-NEXT: lsl x14, x14, x12 +; ALL-NEXT: lsl x12, x15, x12 +; ALL-NEXT: lsl x15, x8, #1 ; ALL-NEXT: lsr x8, x8, x10 -; ALL-NEXT: lsl x10, x16, x13 -; ALL-NEXT: lsl x13, x15, x13 -; ALL-NEXT: orr x11, x14, x11 -; ALL-NEXT: stp x11, x9, [x2, #16] -; ALL-NEXT: orr x8, x10, x8 +; ALL-NEXT: mvn w10, w10 +; ALL-NEXT: lsl x10, 
x15, x10 +; ALL-NEXT: orr x8, x14, x8 +; ALL-NEXT: stp x8, x9, [x2, #16] ; ALL-NEXT: orr x9, x12, x13 -; ALL-NEXT: stp x8, x9, [x2] +; ALL-NEXT: orr x8, x11, x10 +; ALL-NEXT: stp x9, x8, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 @@ -201,31 +204,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldr x10, [x1] ; ALL-NEXT: ldr q1, [x0] ; ALL-NEXT: stp x9, x8, [sp, #48] -; ALL-NEXT: mov x8, sp -; ALL-NEXT: ubfx x9, x10, #3, #5 -; ALL-NEXT: add x8, x8, #32 +; ALL-NEXT: lsr x8, x10, #3 +; ALL-NEXT: mov x9, sp +; ALL-NEXT: add x9, x9, #32 ; ALL-NEXT: stp q0, q1, [sp, #16] -; ALL-NEXT: and x10, x10, #0x7 +; ALL-NEXT: and x12, x10, #0x3f +; ALL-NEXT: and x8, x8, #0x18 ; ALL-NEXT: str q0, [sp] -; ALL-NEXT: sub x8, x8, x9 -; ALL-NEXT: mvn w13, w10 -; ALL-NEXT: ldp x9, x11, [x8] -; ALL-NEXT: ldp x12, x8, [x8, #16] -; ALL-NEXT: lsr x14, x9, #1 -; ALL-NEXT: lsr x15, x11, #1 -; ALL-NEXT: lsl x11, x11, x10 -; ALL-NEXT: lsr x16, x12, #1 +; ALL-NEXT: eor x12, x12, #0x3f +; ALL-NEXT: sub x8, x9, x8 +; ALL-NEXT: ldp x11, x13, [x8, #16] +; ALL-NEXT: ldr x9, [x8] +; ALL-NEXT: ldr x8, [x8, #8] +; ALL-NEXT: lsr x15, x9, #1 ; ALL-NEXT: lsl x9, x9, x10 -; ALL-NEXT: lsl x12, x12, x10 -; ALL-NEXT: lsr x14, x14, x13 +; ALL-NEXT: lsr x14, x11, #1 +; ALL-NEXT: lsl x11, x11, x10 +; ALL-NEXT: lsl x13, x13, x10 +; ALL-NEXT: lsr x14, x14, x12 +; ALL-NEXT: lsr x12, x15, x12 +; ALL-NEXT: lsr x15, x8, #1 ; ALL-NEXT: lsl x8, x8, x10 -; ALL-NEXT: lsr x10, x16, x13 -; ALL-NEXT: lsr x13, x15, x13 -; ALL-NEXT: orr x11, x11, x14 -; ALL-NEXT: stp x9, x11, [x2] -; ALL-NEXT: orr x8, x8, x10 -; ALL-NEXT: orr x9, x12, x13 -; ALL-NEXT: stp x9, x8, [x2, #16] +; ALL-NEXT: mvn w10, w10 +; ALL-NEXT: lsr x10, x15, x10 +; ALL-NEXT: orr x8, x8, x12 +; ALL-NEXT: stp x9, x8, [x2] +; ALL-NEXT: orr x9, x13, x14 +; ALL-NEXT: orr x8, x11, x10 +; ALL-NEXT: stp x8, x9, [x2, #16] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, 
ptr %src.ptr, align 1 @@ -243,31 +249,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ALL-NEXT: ldr x10, [x1] ; ALL-NEXT: ldr q0, [x0] ; ALL-NEXT: stp x9, x8, [sp, #16] +; ALL-NEXT: lsr x9, x10, #3 ; ALL-NEXT: asr x8, x8, #63 -; ALL-NEXT: ubfx x9, x10, #3, #5 ; ALL-NEXT: str q0, [sp] -; ALL-NEXT: and x10, x10, #0x7 +; ALL-NEXT: and x12, x10, #0x3f +; ALL-NEXT: and x9, x9, #0x18 ; ALL-NEXT: stp x8, x8, [sp, #48] -; ALL-NEXT: add x9, x11, x9 -; ALL-NEXT: mvn w13, w10 +; ALL-NEXT: eor x12, x12, #0x3f ; ALL-NEXT: stp x8, x8, [sp, #32] -; ALL-NEXT: ldp x11, x8, [x9, #16] -; ALL-NEXT: ldp x9, x12, [x9] -; ALL-NEXT: lsl x14, x8, #1 +; ALL-NEXT: add x8, x11, x9 +; ALL-NEXT: ldp x13, x11, [x8] +; ALL-NEXT: ldr x9, [x8, #24] +; ALL-NEXT: ldr x8, [x8, #16] +; ALL-NEXT: lsl x14, x9, #1 +; ALL-NEXT: asr x9, x9, x10 ; ALL-NEXT: lsl x15, x11, #1 ; ALL-NEXT: lsr x11, x11, x10 -; ALL-NEXT: lsl x16, x12, #1 -; ALL-NEXT: asr x8, x8, x10 -; ALL-NEXT: lsr x12, x12, x10 -; ALL-NEXT: lsl x14, x14, x13 -; ALL-NEXT: lsr x9, x9, x10 -; ALL-NEXT: lsl x10, x16, x13 -; ALL-NEXT: lsl x13, x15, x13 -; ALL-NEXT: orr x11, x14, x11 -; ALL-NEXT: stp x11, x8, [x2, #16] -; ALL-NEXT: orr x8, x10, x9 +; ALL-NEXT: lsr x13, x13, x10 +; ALL-NEXT: lsl x14, x14, x12 +; ALL-NEXT: lsl x12, x15, x12 +; ALL-NEXT: lsl x15, x8, #1 +; ALL-NEXT: lsr x8, x8, x10 +; ALL-NEXT: mvn w10, w10 +; ALL-NEXT: lsl x10, x15, x10 +; ALL-NEXT: orr x8, x14, x8 +; ALL-NEXT: stp x8, x9, [x2, #16] ; ALL-NEXT: orr x9, x12, x13 -; ALL-NEXT: stp x8, x9, [x2] +; ALL-NEXT: orr x8, x11, x10 +; ALL-NEXT: stp x9, x8, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index ead790203f94..fb3f8ebd7d14 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -294,19 +294,10 @@ entry: } define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) { -; CHECK-SD-LABEL: 
xtn_v3i32_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: xtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: xtn_v3i32_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, v0.s[1] -; CHECK-GI-NEXT: mov w9, v0.s[2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v0.h[2], w9 -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: xtn_v3i32_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %arg1 = trunc <3 x i32> %a to <3 x i16> ret <3 x i16> %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index bb7bc0447aea..c5ded11c7d32 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -167,8 +167,8 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 ; GFX10-NEXT: s_mov_b32 s7, 1 +; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 ; GFX10-NEXT: s_cbranch_vccz .LBB3_4 ; GFX10-NEXT: ; %bb.3: ; %else ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 49c232661c6d..b27d8fdc24ff 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -75,12 +75,12 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; 
GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4 -; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 ; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo ; GFX10-NEXT: s_and_b32 s8, exec_lo, s6 +; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 ; GFX10-NEXT: s_or_b32 s4, s7, s8 ; GFX10-NEXT: s_cbranch_vccz .LBB1_4 ; GFX10-NEXT: .LBB1_2: ; %loop.start @@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll new file mode 100644 index 000000000000..c8ba6722d9d8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefix=OBJDUMP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=ASM %s + +; OBJDUMP: Contents of section .rodata: +; OBJDUMP-NEXT: 0000 00000000 00000000 10010000 00000000 ................ +; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................ 
+; OBJDUMP-NEXT: 0030 4000af00 94130000 1a000400 00000000 @............... + +; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: +; ASM: .amdhsa_user_sgpr_count 10 +; ASM: .amdhsa_next_free_sgpr 10 +; ASM: ; NumSgprs: 16 +; ASM: ; NumSGPRsForWavesPerEU: 16 + +; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT +; field that are not explicitly referenced in the kernel. This test has 6 implicit +; user SGPRs enabled, 4 preloaded kernarg SGPRs, plus 6 extra SGPRs allocated +; for flat scratch, etc. The total number of allocated SGPRs encoded in the +; kernel descriptor should be 16. That's a 1 in the KD field since the granule +; size is 8 and it's NumGranules - 1. The encoding for that looks like '40'. + +define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret void } + +; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000 ........ ....... +; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000 @............... + +; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2: +; ASM: .amdhsa_user_sgpr_count 10 +; ASM: .amdhsa_next_free_sgpr 10 +; ASM: ; NumSgprs: 16 +; ASM: ; NumSGPRsForWavesPerEU: 16 + +; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2 +; implicit, and 6 extra. + +define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 { ret void } + +; OBJDUMP-NEXT: 0080 00000000 00000000 08010000 00000000 ................ +; OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 00b0 4000af00 86000000 08000100 00000000 @...............
+ +; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2: +; ASM: .amdhsa_user_sgpr_count 3 +; ASM: .amdhsa_next_free_sgpr 3 +; ASM: ; NumSgprs: 9 +; ASM: ; NumSGPRsForWavesPerEU: 9 + +; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD. + +define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { ret void } + +; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000 ................ +; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000 ................ +; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000 ................ + +; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2: +; ASM: .amdhsa_user_sgpr_count 2 +; ASM: .amdhsa_next_free_sgpr 0 +; ASM: ; NumSgprs: 6 +; ASM: ; NumSGPRsForWavesPerEU: 6 + +; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD. +; Encoded like '00'. + +define amdgpu_kernel void @amdhsa_kernarg_preload_0_implicit_2(i32) #0 { ret void } + +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index b17dfc7c3754..ce608df44dc4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1323,9 +1323,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; 
GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -1451,10 +1451,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -1587,9 +1586,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -3228,8 +3227,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; 
GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -4991,9 +4990,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -5119,10 +5118,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -5255,9 +5253,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_wait_alu 0xfffe ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -6938,8 +6936,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s8, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; 
GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 988bc8eec6e5..ce90fbed8131 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -936,8 +936,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -1047,8 +1047,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -2684,8 +2684,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; 
implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 @@ -2874,8 +2874,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 @@ -3383,8 +3383,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1 +; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 @@ -4444,8 +4444,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 @@ -4555,8 +4555,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 @@ -6218,8 +6218,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 @@ -6408,8 +6408,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 @@ -6915,8 +6915,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 @@ -7026,9 +7026,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 @@ -7627,8 +7626,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 @@ -7786,8 +7785,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 @@ -8294,8 +8293,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 @@ -8405,8 +8404,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 
exec_lo, s0 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 @@ -9006,8 +9005,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 @@ -9165,8 +9164,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 @@ -9673,8 +9672,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 @@ -9784,8 +9783,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; 
GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 @@ -10385,8 +10384,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 @@ -10544,8 +10543,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 @@ -11051,8 +11050,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 @@ -11162,9 +11161,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; 
GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 @@ -12196,8 +12194,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 @@ -12415,8 +12413,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 @@ -12923,8 +12921,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 @@ -13034,9 +13032,8 @@ define amdgpu_kernel 
void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 @@ -14788,8 +14785,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 @@ -14899,8 +14896,8 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 @@ -15909,8 +15906,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; 
GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 @@ -16125,8 +16122,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 @@ -16633,8 +16630,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 @@ -16744,9 +16741,8 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 @@ -17754,8 +17750,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; 
GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 @@ -17970,8 +17966,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 2b18f472c8c4..c3a197ce9985 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -1263,16 +1263,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: @@ -1483,16 +1483,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -2471,16 +2471,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: @@ -2721,16 +2721,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, 
SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -4503,16 +4503,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: @@ -4753,16 +4753,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; 
GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5929,19 +5929,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s33, s8 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: @@ -6378,19 +6378,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, 
SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -7595,8 +7595,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -8020,16 +8020,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; 
GFX1032-NEXT: ; %bb.1: @@ -8277,16 +8277,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -9107,8 +9107,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -9444,16 +9444,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; 
GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: @@ -9701,16 +9701,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -10531,8 +10531,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -11437,8 +11437,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; 
GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -13574,8 +13574,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index e3144ae24ae8..69c6adf0300c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: @@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: @@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: 
s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index ddc103184cdf..b7890f30f776 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -3348,17 +3348,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: @@ -3778,17 +3778,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5038,8 +5038,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -6403,8 +6403,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6844,17 +6844,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; ; GFX1032-LABEL: 
global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: @@ -7274,17 +7274,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -8534,8 +8534,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index f353edff1b47..fcd5d0dc497e 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -1367,16 +1367,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: @@ -1617,16 +1617,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; 
GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -2687,16 +2687,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: @@ -2937,16 +2937,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -4823,16 +4823,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: @@ -5073,16 +5073,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 
; GFX1032-DPP-NEXT: ; %bb.1: @@ -6249,19 +6249,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s33, s8 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-NEXT: s_mov_b32 s44, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: @@ -6698,19 +6698,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 ; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, 
v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -7915,8 +7915,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 @@ -8340,16 +8340,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: @@ -8597,16 +8597,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 
s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -9426,8 +9426,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -9763,16 +9763,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: @@ -10020,16 +10020,16 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -10850,8 +10850,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -11756,8 +11756,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 @@ -13892,8 +13892,8 @@ define 
amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index b061d53de5d3..39a3b1c8adc9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -2,11 +2,118 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 -declare i64 @llvm.amdgcn.readfirstlane.i64(i64) #0 -declare double @llvm.amdgcn.readfirstlane.f64(double) #0 +define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte 
v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i1_inreg(ptr addrspace(1) %out, i1 inreg %src) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_inreg: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: s_and_b32 s4, s6, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1_inreg: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: s_and_b32 s4, s6, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %src) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i1_select(ptr addrspace(1) %out, i32 %src, i32 %src1) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_select: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 +; CHECK-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-SDAG-NEXT: s_bitcmp1_b32 s4, 0 +; CHECK-SDAG-NEXT: s_cselect_b64 vcc, -1, 0 +; CHECK-SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1_select: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: v_cmp_lt_u32_e32 vcc, 42, v2 +; 
CHECK-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v4 +; CHECK-GISEL-NEXT: s_and_b32 s4, 1, s4 +; CHECK-GISEL-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; CHECK-GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp ugt i32 %src, 42 + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %cmp) + %sel = select i1 %readfirstlane, i32 %src, i32 %src1 + store i32 %sel, ptr addrspace(1) %out, align 4 + ret void +} -define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 { +define void @test_readfirstlane_i1_load(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; CHECK-SDAG-LABEL: test_readfirstlane_i1_load: +; CHECK-SDAG: ; %bb.0: +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-SDAG-NEXT: flat_load_ubyte v2, v[2:3] +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-SDAG-NEXT: s_and_b32 s4, s4, 1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-SDAG-NEXT: flat_store_byte v[0:1], v2 +; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) +; CHECK-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CHECK-GISEL-LABEL: test_readfirstlane_i1_load: +; CHECK-GISEL: ; %bb.0: +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-GISEL-NEXT: flat_load_ubyte v2, v[2:3] +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; CHECK-GISEL-NEXT: s_and_b32 s4, s4, 1 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-GISEL-NEXT: flat_store_byte v[0:1], v2 +; CHECK-GISEL-NEXT: s_waitcnt vmcnt(0) +; CHECK-GISEL-NEXT: s_setpc_b64 s[30:31] + %load = load i1, ptr addrspace(1) %in + %readfirstlane = call i1 @llvm.amdgcn.readfirstlane.i1(i1 %load) + store i1 %readfirstlane, ptr addrspace(1) %out, align 4 + ret void +} + +define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) { ; CHECK-SDAG-LABEL: 
test_readfirstlane_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -29,7 +136,7 @@ define void @test_readfirstlane_i32(ptr addrspace(1) %out, i32 %src) #1 { ret void } -define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 { +define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -56,7 +163,7 @@ define void @test_readfirstlane_i64(ptr addrspace(1) %out, i64 %src) #1 { ret void } -define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 { +define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -83,7 +190,7 @@ define void @test_readfirstlane_f64(ptr addrspace(1) %out, double %src) #1 { ret void } -define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_mov_b32 s0, 32 @@ -104,7 +211,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i32(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_mov_b64 s[0:1], 32 @@ -125,7 +232,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_i64(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: 
s_mov_b32 s0, 0 @@ -148,7 +255,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) #1 ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -173,7 +280,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -201,7 +308,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -230,7 +337,7 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -262,7 +369,7 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 { ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; 
CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -294,7 +401,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -328,7 +435,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 @@ -362,7 +469,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ret void } -define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 { +define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_fi: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_add_u32 s0, s0, s15 @@ -593,6 +700,3 @@ define void @test_readfirstlane_v8i16(ptr addrspace(1) %out, <8 x i16> %src) { call void asm sideeffect "; use $0", "s"(<8 x i16> %x) ret void } - -attributes #0 = { nounwind readnone convergent } -attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 684ca3aac7c3..004a720b9ab4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -216,8 +216,8 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 ; 
GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 ; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 ; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index f60786c1bacb..6f841c88a6d8 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -4,8 +4,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 @@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_else_vgpr_opt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll index 90b32e29e98f..3519befabd3b 100644 --- a/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll @@ -4,10 +4,10 @@ define amdgpu_cs void @should_not_hoist_set_inactive(<4 x i32> inreg %i14, i32 inreg %v, i32 %lane, i32 %f, i32 %f2) #0 { ; GCN-LABEL: 
should_not_hoist_set_inactive: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1 ; GCN-NEXT: v_cmp_eq_u32_e64 s5, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s6, 0, v2 ; GCN-NEXT: s_mov_b32 s7, 0 +; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v1 ; GCN-NEXT: s_branch .LBB0_2 ; GCN-NEXT: .LBB0_1: ; %bb4 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir index 1a7ec5db9efa..be759049bc3a 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-true16.mir @@ -11,7 +11,7 @@ body: | ; GFX1100-LABEL: name: 16bit_lo128_shrink ; GFX1100: liveins: $vgpr127 ; GFX1100-NEXT: {{ $}} - ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc, implicit $exec, implicit $exec + ; GFX1100-NEXT: V_CMP_EQ_U16_t16_e32 0, $vgpr127, implicit-def $vcc_lo, implicit $exec, implicit $exec $vcc_lo = V_CMP_EQ_U16_t16_e64 0, $vgpr127, implicit-def $vcc, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir new file mode 100644 index 000000000000..73c55265af20 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-v-cmp-wave32-dead-vcc-lo.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=si-shrink-instructions -mcpu=gfx1100 -o - %s | FileCheck %s + +# Make sure there's no crash when shrinking a v_cmp on a wave32 target +# when the def is dead. Previously the vcc implicit def wasn't +# properly replaced with vcc_lo, so the expected implicit operand was +# not found in the shrunk instruction. 
+ +--- +name: shrink_v_cmp_vcc_lo_dead +tracksRegLiveness: true +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_dead + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def dead $vcc_lo, implicit $exec + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 + dead renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31 + +... + +--- +name: shrink_v_cmp_vcc_lo_live +tracksRegLiveness: true +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: shrink_v_cmp_vcc_lo_live + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: V_CMP_LT_U32_e32 $vgpr0, $vgpr1, implicit-def $vcc_lo, implicit $exec + ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo + renamable $vcc_lo = V_CMP_LT_U32_e64 $vgpr0, $vgpr1, implicit $exec + S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vcc_lo + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index eebd32cd67e6..8e0a83671a18 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1027,8 +1027,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; ; GFX10-WAVE32-LABEL: test_kill_divergent_loop: ; GFX10-WAVE32: ; %bb.0: ; %entry -; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index 25d8300eb458..a0bce3432a4b 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -86,8 +86,8 @@ end: define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 { ; SI-LABEL: else3: ; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_mov_b32 s1, 0 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_branch .LBB2_2 ; SI-NEXT: .LBB2_1: ; %if.end ; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1 @@ -161,16 +161,16 @@ for.end: define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 { ; SI-LABEL: loop: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_mov_b32_e32 v0, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: 
s_xor_b32 s6, exec_lo, s0 @@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: v_mov_b32_e32 v40, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s6, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 92117e0688f6..4576d829b0cb 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: .LBB10_2: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 -; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_4 ; GFX1032-NEXT: ; %bb.3: ; %bb5 @@ -515,8 +515,8 @@ bb13: define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader diff --git a/llvm/test/CodeGen/ARM/vbsl.ll b/llvm/test/CodeGen/ARM/vbsl.ll index 735fa5182fe7..8564a48fbc3d 100644 --- a/llvm/test/CodeGen/ARM/vbsl.ll +++ b/llvm/test/CodeGen/ARM/vbsl.ll @@ -1,17 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm-eabi 
-mattr=+neon %s -o - | FileCheck %s - -; rdar://12471808 +; RUN: llc -mtriple=armv7-eabihf -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @v_bsli8(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: v_bsli8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vorr d0, d18, d18 ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d17, d16 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C @@ -27,10 +25,10 @@ define <4 x i16> @v_bsli16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vorr d0, d18, d18 ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d17, d16 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C @@ -46,10 +44,10 @@ define <2 x i32> @v_bsli32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vorr d0, d18, d18 ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d17, d16 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C @@ -65,10 +63,10 @@ define <1 x i64> @v_bsli64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d18, [r0] ; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vorr d0, d18, d18 ; CHECK-NEXT: vldr d17, [r1] -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d17, d16 +; CHECK-NEXT: bx lr %tmp1 = load <1 x i64>, ptr %A %tmp2 = load <1 x i64>, ptr %B %tmp3 = load <1 x i64>, ptr %C @@ -83,12 +81,11 @@ define <16 x i8> 
@v_bslQi8(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: v_bslQi8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vorr q0, q10, q10 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vbit q8, q9, q10 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q9, q8 +; CHECK-NEXT: bx lr %tmp1 = load <16 x i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = load <16 x i8>, ptr %C @@ -103,12 +100,11 @@ define <8 x i16> @v_bslQi16(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: v_bslQi16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vorr q0, q10, q10 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vbit q8, q9, q10 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q9, q8 +; CHECK-NEXT: bx lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = load <8 x i16>, ptr %C @@ -123,12 +119,11 @@ define <4 x i32> @v_bslQi32(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: v_bslQi32: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vorr q0, q10, q10 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vbit q8, q9, q10 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q9, q8 +; CHECK-NEXT: bx lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = load <4 x i32>, ptr %C @@ -143,12 +138,11 @@ define <2 x i64> @v_bslQi64(ptr %A, ptr %B, ptr %C) nounwind { ; CHECK-LABEL: v_bslQi64: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vorr q0, q10, q10 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vbit q8, q9, q10 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr 
+; CHECK-NEXT: vbsl q0, q9, q8 +; CHECK-NEXT: bx lr %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %tmp3 = load <2 x i64>, ptr %C @@ -162,12 +156,8 @@ define <2 x i64> @v_bslQi64(ptr %A, ptr %B, ptr %C) nounwind { define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: f1: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind ret <8 x i8> %vbsl.i } @@ -175,12 +165,8 @@ define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone opt define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: f2: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind ret <4 x i16> %vbsl3.i } @@ -188,12 +174,8 @@ define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: f3: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind ret <2 x i32> %vbsl3.i } @@ -201,12 +183,8 @@ define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, 
<2 x i32> %c) nounwind readnone define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: f4: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind ret <2 x float> %vbsl4.i } @@ -214,16 +192,8 @@ define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g1: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind ret <16 x i8> %vbsl.i } @@ -231,16 +201,8 @@ define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g2: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> 
%c) nounwind ret <8 x i16> %vbsl3.i } @@ -248,16 +210,8 @@ define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g3: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind ret <4 x i32> %vbsl3.i } @@ -265,16 +219,8 @@ define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: g4: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind ret <4 x float> %vbsl4.i } @@ -282,12 +228,8 @@ define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbsl_s64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: 
vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind ret <1 x i64> %vbsl3.i } @@ -295,12 +237,8 @@ define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwi define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbsl_u64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [sp] -; CHECK-NEXT: vmov d17, r2, r3 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vbit d16, d17, d18 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl d0, d1, d2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind ret <1 x i64> %vbsl3.i } @@ -308,16 +246,8 @@ define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwi define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbslq_s64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind ret <2 x i64> %vbsl3.i } @@ -325,16 +255,8 @@ define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounw define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: test_vbslq_u64: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: add r12, sp, #16 -; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d16, d17}, [r12] -; 
CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vbit q8, q10, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: vbsl q0, q1, q2 +; CHECK-NEXT: bx lr %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind ret <2 x i64> %vbsl3.i } diff --git a/llvm/test/CodeGen/Generic/allow-check.ll b/llvm/test/CodeGen/Generic/allow-check.ll index a08488959862..148ee811ea80 100644 --- a/llvm/test/CodeGen/Generic/allow-check.ll +++ b/llvm/test/CodeGen/Generic/allow-check.ll @@ -1,5 +1,5 @@ ; Avoid `!DL->isLittleEndian() && !CLI->enableBigEndian()` missmatch on PPC64BE. -; REQUIRES: host-byteorder-little-endian +; REQUIRES: target-byteorder-little-endian ; -global-isel=1 is unsupported. ; XFAIL: target=loongarch{{.*}} diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir index 58e2e644b000..a40b4d85773b 100644 --- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir +++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir @@ -40,9 +40,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0 + %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32_avar 0, 4, 1, 0, 32, &test_param_1 + %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00 %3 = FADD_rnf64ri %1, double 3.250000e+00 %4 = CVT_f32_f64 %3, 5 @@ -66,9 +66,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32_avar 0, 4, 1, 2, 32, &test2_param_0 + %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32_avar 0, 4, 1, 0, 32, &test2_param_1 + %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000 %3 = FADD_rnf64ri %1, double 
0x7FF8000000000000 %4 = CVT_f32_f64 %3, 5 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll index 450fe968d491..2b8129acb91f 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/ashr.ll @@ -382,53 +382,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MIPS: # %bb.0: # %entry ; MIPS-NEXT: addiu $sp, $sp, -32 ; MIPS-NEXT: .cfi_def_cfa_offset 32 -; MIPS-NEXT: swl $7, 28($sp) -; MIPS-NEXT: swl $6, 24($sp) ; MIPS-NEXT: sra $1, $4, 31 -; MIPS-NEXT: swl $5, 20($sp) -; MIPS-NEXT: swl $4, 16($sp) -; MIPS-NEXT: swl $1, 12($sp) -; MIPS-NEXT: swl $1, 8($sp) -; MIPS-NEXT: swl $1, 4($sp) -; MIPS-NEXT: swl $1, 0($sp) -; MIPS-NEXT: addiu $2, $sp, 0 -; MIPS-NEXT: swr $7, 31($sp) -; MIPS-NEXT: swr $6, 27($sp) -; MIPS-NEXT: swr $5, 23($sp) -; MIPS-NEXT: swr $4, 19($sp) -; MIPS-NEXT: swr $1, 15($sp) -; MIPS-NEXT: swr $1, 11($sp) -; MIPS-NEXT: swr $1, 7($sp) -; MIPS-NEXT: swr $1, 3($sp) -; MIPS-NEXT: addiu $1, $2, 16 +; MIPS-NEXT: sw $7, 28($sp) +; MIPS-NEXT: sw $6, 24($sp) +; MIPS-NEXT: sw $5, 20($sp) +; MIPS-NEXT: sw $4, 16($sp) +; MIPS-NEXT: sw $1, 12($sp) +; MIPS-NEXT: sw $1, 8($sp) +; MIPS-NEXT: sw $1, 4($sp) +; MIPS-NEXT: sw $1, 0($sp) +; MIPS-NEXT: addiu $1, $sp, 0 +; MIPS-NEXT: addiu $1, $1, 16 ; MIPS-NEXT: lw $2, 60($sp) ; MIPS-NEXT: srl $3, $2, 3 -; MIPS-NEXT: andi $3, $3, 15 +; MIPS-NEXT: andi $3, $3, 12 ; MIPS-NEXT: subu $1, $1, $3 -; MIPS-NEXT: lwl $3, 4($1) -; MIPS-NEXT: lwr $3, 7($1) -; MIPS-NEXT: sll $4, $3, 1 -; MIPS-NEXT: lwl $5, 8($1) -; MIPS-NEXT: lwr $5, 11($1) -; MIPS-NEXT: andi $2, $2, 7 -; MIPS-NEXT: not $6, $2 -; MIPS-NEXT: srlv $7, $5, $2 -; MIPS-NEXT: sllv $4, $4, $6 +; MIPS-NEXT: lw $3, 4($1) +; MIPS-NEXT: lw $5, 8($1) +; MIPS-NEXT: srlv $4, $5, $2 +; MIPS-NEXT: sll $6, $3, 1 +; MIPS-NEXT: andi $7, $2, 31 +; MIPS-NEXT: xori $7, $7, 31 +; MIPS-NEXT: sllv $6, $6, $7 ; MIPS-NEXT: srlv $3, $3, $2 -; MIPS-NEXT: lwl $6, 0($1) -; MIPS-NEXT: lwr $6, 3($1) -; 
MIPS-NEXT: sll $8, $6, 1 -; MIPS-NEXT: xori $9, $2, 31 -; MIPS-NEXT: sllv $8, $8, $9 -; MIPS-NEXT: or $3, $3, $8 -; MIPS-NEXT: or $4, $7, $4 -; MIPS-NEXT: lwl $7, 12($1) -; MIPS-NEXT: lwr $7, 15($1) -; MIPS-NEXT: srlv $1, $7, $2 +; MIPS-NEXT: lw $8, 0($1) +; MIPS-NEXT: sll $9, $8, 1 +; MIPS-NEXT: sllv $9, $9, $7 +; MIPS-NEXT: or $3, $3, $9 +; MIPS-NEXT: or $4, $4, $6 +; MIPS-NEXT: lw $1, 12($1) +; MIPS-NEXT: srlv $1, $1, $2 ; MIPS-NEXT: sll $5, $5, 1 -; MIPS-NEXT: sllv $5, $5, $9 +; MIPS-NEXT: sllv $5, $5, $7 ; MIPS-NEXT: or $5, $1, $5 -; MIPS-NEXT: srav $2, $6, $2 +; MIPS-NEXT: srav $2, $8, $2 ; MIPS-NEXT: jr $ra ; MIPS-NEXT: addiu $sp, $sp, 32 ; @@ -436,53 +423,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MIPS32: # %bb.0: # %entry ; MIPS32-NEXT: addiu $sp, $sp, -32 ; MIPS32-NEXT: .cfi_def_cfa_offset 32 -; MIPS32-NEXT: swl $7, 28($sp) -; MIPS32-NEXT: swl $6, 24($sp) ; MIPS32-NEXT: sra $1, $4, 31 -; MIPS32-NEXT: swl $5, 20($sp) -; MIPS32-NEXT: swl $4, 16($sp) -; MIPS32-NEXT: swl $1, 12($sp) -; MIPS32-NEXT: swl $1, 8($sp) -; MIPS32-NEXT: swl $1, 4($sp) -; MIPS32-NEXT: swl $1, 0($sp) -; MIPS32-NEXT: addiu $2, $sp, 0 -; MIPS32-NEXT: swr $7, 31($sp) -; MIPS32-NEXT: swr $6, 27($sp) -; MIPS32-NEXT: swr $5, 23($sp) -; MIPS32-NEXT: swr $4, 19($sp) -; MIPS32-NEXT: swr $1, 15($sp) -; MIPS32-NEXT: swr $1, 11($sp) -; MIPS32-NEXT: swr $1, 7($sp) -; MIPS32-NEXT: swr $1, 3($sp) -; MIPS32-NEXT: addiu $1, $2, 16 +; MIPS32-NEXT: sw $7, 28($sp) +; MIPS32-NEXT: sw $6, 24($sp) +; MIPS32-NEXT: sw $5, 20($sp) +; MIPS32-NEXT: sw $4, 16($sp) +; MIPS32-NEXT: sw $1, 12($sp) +; MIPS32-NEXT: sw $1, 8($sp) +; MIPS32-NEXT: sw $1, 4($sp) +; MIPS32-NEXT: sw $1, 0($sp) +; MIPS32-NEXT: addiu $1, $sp, 0 +; MIPS32-NEXT: addiu $1, $1, 16 ; MIPS32-NEXT: lw $2, 60($sp) ; MIPS32-NEXT: srl $3, $2, 3 -; MIPS32-NEXT: andi $3, $3, 15 +; MIPS32-NEXT: andi $3, $3, 12 ; MIPS32-NEXT: subu $1, $1, $3 -; MIPS32-NEXT: lwl $3, 4($1) -; MIPS32-NEXT: lwr $3, 7($1) -; MIPS32-NEXT: sll $4, 
$3, 1 -; MIPS32-NEXT: lwl $5, 8($1) -; MIPS32-NEXT: lwr $5, 11($1) -; MIPS32-NEXT: andi $2, $2, 7 -; MIPS32-NEXT: not $6, $2 -; MIPS32-NEXT: srlv $7, $5, $2 -; MIPS32-NEXT: sllv $4, $4, $6 +; MIPS32-NEXT: lw $3, 4($1) +; MIPS32-NEXT: lw $5, 8($1) +; MIPS32-NEXT: srlv $4, $5, $2 +; MIPS32-NEXT: sll $6, $3, 1 +; MIPS32-NEXT: andi $7, $2, 31 +; MIPS32-NEXT: xori $7, $7, 31 +; MIPS32-NEXT: sllv $6, $6, $7 ; MIPS32-NEXT: srlv $3, $3, $2 -; MIPS32-NEXT: lwl $6, 0($1) -; MIPS32-NEXT: lwr $6, 3($1) -; MIPS32-NEXT: sll $8, $6, 1 -; MIPS32-NEXT: xori $9, $2, 31 -; MIPS32-NEXT: sllv $8, $8, $9 -; MIPS32-NEXT: or $3, $3, $8 -; MIPS32-NEXT: or $4, $7, $4 -; MIPS32-NEXT: lwl $7, 12($1) -; MIPS32-NEXT: lwr $7, 15($1) -; MIPS32-NEXT: srlv $1, $7, $2 +; MIPS32-NEXT: lw $8, 0($1) +; MIPS32-NEXT: sll $9, $8, 1 +; MIPS32-NEXT: sllv $9, $9, $7 +; MIPS32-NEXT: or $3, $3, $9 +; MIPS32-NEXT: or $4, $4, $6 +; MIPS32-NEXT: lw $1, 12($1) +; MIPS32-NEXT: srlv $1, $1, $2 ; MIPS32-NEXT: sll $5, $5, 1 -; MIPS32-NEXT: sllv $5, $5, $9 +; MIPS32-NEXT: sllv $5, $5, $7 ; MIPS32-NEXT: or $5, $1, $5 -; MIPS32-NEXT: srav $2, $6, $2 +; MIPS32-NEXT: srav $2, $8, $2 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: addiu $sp, $sp, 32 ; @@ -490,52 +464,40 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; 32R2: # %bb.0: # %entry ; 32R2-NEXT: addiu $sp, $sp, -32 ; 32R2-NEXT: .cfi_def_cfa_offset 32 -; 32R2-NEXT: swl $7, 28($sp) -; 32R2-NEXT: swl $6, 24($sp) -; 32R2-NEXT: swl $5, 20($sp) ; 32R2-NEXT: sra $1, $4, 31 -; 32R2-NEXT: swl $4, 16($sp) -; 32R2-NEXT: swl $1, 12($sp) -; 32R2-NEXT: swl $1, 8($sp) -; 32R2-NEXT: swl $1, 4($sp) -; 32R2-NEXT: swl $1, 0($sp) -; 32R2-NEXT: swr $7, 31($sp) -; 32R2-NEXT: swr $6, 27($sp) -; 32R2-NEXT: swr $5, 23($sp) -; 32R2-NEXT: swr $4, 19($sp) -; 32R2-NEXT: swr $1, 15($sp) -; 32R2-NEXT: swr $1, 11($sp) -; 32R2-NEXT: swr $1, 7($sp) -; 32R2-NEXT: swr $1, 3($sp) +; 32R2-NEXT: sw $7, 28($sp) +; 32R2-NEXT: sw $6, 24($sp) +; 32R2-NEXT: sw $5, 20($sp) +; 32R2-NEXT: sw $4, 
16($sp) +; 32R2-NEXT: sw $1, 12($sp) +; 32R2-NEXT: sw $1, 8($sp) +; 32R2-NEXT: sw $1, 4($sp) +; 32R2-NEXT: sw $1, 0($sp) ; 32R2-NEXT: addiu $1, $sp, 0 ; 32R2-NEXT: addiu $1, $1, 16 ; 32R2-NEXT: lw $2, 60($sp) -; 32R2-NEXT: ext $3, $2, 3, 4 +; 32R2-NEXT: srl $3, $2, 3 +; 32R2-NEXT: andi $3, $3, 12 ; 32R2-NEXT: subu $1, $1, $3 -; 32R2-NEXT: lwl $3, 4($1) -; 32R2-NEXT: lwr $3, 7($1) -; 32R2-NEXT: sll $4, $3, 1 -; 32R2-NEXT: lwl $5, 8($1) -; 32R2-NEXT: lwr $5, 11($1) -; 32R2-NEXT: andi $2, $2, 7 -; 32R2-NEXT: not $6, $2 -; 32R2-NEXT: srlv $7, $5, $2 -; 32R2-NEXT: sllv $4, $4, $6 +; 32R2-NEXT: lw $3, 4($1) +; 32R2-NEXT: lw $5, 8($1) +; 32R2-NEXT: srlv $4, $5, $2 +; 32R2-NEXT: sll $6, $3, 1 +; 32R2-NEXT: andi $7, $2, 31 +; 32R2-NEXT: xori $7, $7, 31 +; 32R2-NEXT: sllv $6, $6, $7 ; 32R2-NEXT: srlv $3, $3, $2 -; 32R2-NEXT: lwl $6, 0($1) -; 32R2-NEXT: lwr $6, 3($1) -; 32R2-NEXT: sll $8, $6, 1 -; 32R2-NEXT: xori $9, $2, 31 -; 32R2-NEXT: sllv $8, $8, $9 -; 32R2-NEXT: or $3, $3, $8 -; 32R2-NEXT: or $4, $7, $4 -; 32R2-NEXT: lwl $7, 12($1) -; 32R2-NEXT: lwr $7, 15($1) -; 32R2-NEXT: srlv $1, $7, $2 +; 32R2-NEXT: lw $8, 0($1) +; 32R2-NEXT: sll $9, $8, 1 +; 32R2-NEXT: sllv $9, $9, $7 +; 32R2-NEXT: or $3, $3, $9 +; 32R2-NEXT: or $4, $4, $6 +; 32R2-NEXT: lw $1, 12($1) +; 32R2-NEXT: srlv $1, $1, $2 ; 32R2-NEXT: sll $5, $5, 1 -; 32R2-NEXT: sllv $5, $5, $9 +; 32R2-NEXT: sllv $5, $5, $7 ; 32R2-NEXT: or $5, $1, $5 -; 32R2-NEXT: srav $2, $6, $2 +; 32R2-NEXT: srav $2, $8, $2 ; 32R2-NEXT: jr $ra ; 32R2-NEXT: addiu $sp, $sp, 32 ; @@ -555,28 +517,28 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; 32R6-NEXT: addiu $1, $sp, 0 ; 32R6-NEXT: addiu $1, $1, 16 ; 32R6-NEXT: lw $2, 60($sp) -; 32R6-NEXT: ext $3, $2, 3, 4 +; 32R6-NEXT: srl $3, $2, 3 +; 32R6-NEXT: andi $3, $3, 12 ; 32R6-NEXT: subu $1, $1, $3 ; 32R6-NEXT: lw $3, 4($1) -; 32R6-NEXT: sll $4, $3, 1 ; 32R6-NEXT: lw $5, 8($1) -; 32R6-NEXT: andi $2, $2, 7 -; 32R6-NEXT: not $6, $2 -; 32R6-NEXT: srlv $7, $5, $2 -; 
32R6-NEXT: sllv $4, $4, $6 +; 32R6-NEXT: srlv $4, $5, $2 +; 32R6-NEXT: sll $6, $3, 1 +; 32R6-NEXT: andi $7, $2, 31 +; 32R6-NEXT: xori $7, $7, 31 +; 32R6-NEXT: sllv $6, $6, $7 ; 32R6-NEXT: srlv $3, $3, $2 -; 32R6-NEXT: lw $6, 0($1) -; 32R6-NEXT: sll $8, $6, 1 -; 32R6-NEXT: xori $9, $2, 31 -; 32R6-NEXT: sllv $8, $8, $9 -; 32R6-NEXT: or $3, $3, $8 -; 32R6-NEXT: or $4, $7, $4 +; 32R6-NEXT: lw $8, 0($1) +; 32R6-NEXT: sll $9, $8, 1 +; 32R6-NEXT: sllv $9, $9, $7 +; 32R6-NEXT: or $3, $3, $9 +; 32R6-NEXT: or $4, $4, $6 ; 32R6-NEXT: lw $1, 12($1) ; 32R6-NEXT: srlv $1, $1, $2 ; 32R6-NEXT: sll $5, $5, 1 -; 32R6-NEXT: sllv $5, $5, $9 +; 32R6-NEXT: sllv $5, $5, $7 ; 32R6-NEXT: or $5, $1, $5 -; 32R6-NEXT: srav $2, $6, $2 +; 32R6-NEXT: srav $2, $8, $2 ; 32R6-NEXT: jr $ra ; 32R6-NEXT: addiu $sp, $sp, 32 ; @@ -656,53 +618,37 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: swp $16, 32($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 -; MMR3-NEXT: swl $7, 28($sp) -; MMR3-NEXT: swl $6, 24($sp) -; MMR3-NEXT: swl $5, 20($sp) ; MMR3-NEXT: sra $1, $4, 31 -; MMR3-NEXT: swl $4, 16($sp) -; MMR3-NEXT: swl $1, 12($sp) -; MMR3-NEXT: swl $1, 8($sp) -; MMR3-NEXT: swl $1, 4($sp) -; MMR3-NEXT: swl $1, 0($sp) -; MMR3-NEXT: swr $7, 31($sp) -; MMR3-NEXT: swr $6, 27($sp) -; MMR3-NEXT: swr $5, 23($sp) -; MMR3-NEXT: swr $4, 19($sp) -; MMR3-NEXT: swr $1, 15($sp) -; MMR3-NEXT: swr $1, 11($sp) -; MMR3-NEXT: swr $1, 7($sp) -; MMR3-NEXT: swr $1, 3($sp) +; MMR3-NEXT: swp $6, 24($sp) +; MMR3-NEXT: swp $4, 16($sp) +; MMR3-NEXT: sw $1, 12($sp) +; MMR3-NEXT: sw $1, 8($sp) +; MMR3-NEXT: sw $1, 4($sp) +; MMR3-NEXT: sw $1, 0($sp) ; MMR3-NEXT: addiur1sp $2, 0 ; MMR3-NEXT: addiur2 $2, $2, 16 ; MMR3-NEXT: lw $3, 68($sp) -; MMR3-NEXT: ext $4, $3, 3, 4 -; MMR3-NEXT: subu16 $2, $2, $4 -; MMR3-NEXT: lwl $7, 4($2) -; MMR3-NEXT: lwr $7, 7($2) -; MMR3-NEXT: sll16 $4, $7, 1 -; MMR3-NEXT: lwl $5, 8($2) -; MMR3-NEXT: lwr $5, 11($2) -; MMR3-NEXT: andi16 $6, $3, 7 -; MMR3-NEXT: 
not16 $3, $6 -; MMR3-NEXT: andi16 $3, $3, 31 -; MMR3-NEXT: srlv $16, $5, $6 -; MMR3-NEXT: sllv $4, $4, $3 -; MMR3-NEXT: srlv $17, $7, $6 -; MMR3-NEXT: lwl $7, 0($2) -; MMR3-NEXT: lwr $7, 3($2) -; MMR3-NEXT: sll16 $3, $7, 1 -; MMR3-NEXT: xori $1, $6, 31 +; MMR3-NEXT: srl16 $4, $3, 3 +; MMR3-NEXT: andi $4, $4, 12 +; MMR3-NEXT: subu16 $5, $2, $4 +; MMR3-NEXT: lwp $6, 4($5) +; MMR3-NEXT: andi16 $2, $3, 31 +; MMR3-NEXT: srlv $16, $7, $2 +; MMR3-NEXT: sll16 $3, $6, 1 +; MMR3-NEXT: xori $1, $2, 31 +; MMR3-NEXT: sllv $4, $3, $1 +; MMR3-NEXT: srlv $6, $6, $2 +; MMR3-NEXT: lw16 $17, 0($5) +; MMR3-NEXT: sll16 $3, $17, 1 ; MMR3-NEXT: sllv $3, $3, $1 -; MMR3-NEXT: or16 $3, $17 +; MMR3-NEXT: or16 $3, $6 ; MMR3-NEXT: or16 $4, $16 -; MMR3-NEXT: lwl $8, 12($2) -; MMR3-NEXT: lwr $8, 15($2) -; MMR3-NEXT: srlv $2, $8, $6 -; MMR3-NEXT: sll16 $5, $5, 1 +; MMR3-NEXT: lw16 $5, 12($5) +; MMR3-NEXT: srlv $6, $5, $2 +; MMR3-NEXT: sll16 $5, $7, 1 ; MMR3-NEXT: sllv $5, $5, $1 -; MMR3-NEXT: or16 $5, $2 -; MMR3-NEXT: srav $2, $7, $6 +; MMR3-NEXT: or16 $5, $6 +; MMR3-NEXT: srav $2, $17, $2 ; MMR3-NEXT: lwp $16, 32($sp) ; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra @@ -714,40 +660,39 @@ define signext i128 @ashr_i128(i128 signext %a, i128 signext %b) { ; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 16, -4 ; MMR6-NEXT: sra $1, $4, 31 -; MMR6-NEXT: sw $7, 32($sp) -; MMR6-NEXT: sw $6, 28($sp) -; MMR6-NEXT: sw $5, 24($sp) -; MMR6-NEXT: sw $4, 20($sp) -; MMR6-NEXT: sw $1, 16($sp) +; MMR6-NEXT: sw $7, 28($sp) +; MMR6-NEXT: sw $6, 24($sp) +; MMR6-NEXT: sw $5, 20($sp) +; MMR6-NEXT: sw $4, 16($sp) ; MMR6-NEXT: sw $1, 12($sp) ; MMR6-NEXT: sw $1, 8($sp) ; MMR6-NEXT: sw $1, 4($sp) -; MMR6-NEXT: addiu $2, $sp, 4 +; MMR6-NEXT: sw $1, 0($sp) +; MMR6-NEXT: addiu $2, $sp, 0 ; MMR6-NEXT: addiur2 $2, $2, 16 ; MMR6-NEXT: lw $3, 68($sp) -; MMR6-NEXT: ext $4, $3, 3, 4 -; MMR6-NEXT: subu16 $5, $2, $4 -; MMR6-NEXT: lw16 $4, 4($5) -; MMR6-NEXT: sll16 $6, $4, 1 -; MMR6-NEXT: lw16 $7, 8($5) -; 
MMR6-NEXT: andi16 $2, $3, 7 -; MMR6-NEXT: not16 $3, $2 -; MMR6-NEXT: andi16 $3, $3, 31 -; MMR6-NEXT: srlv $1, $7, $2 -; MMR6-NEXT: sllv $6, $6, $3 -; MMR6-NEXT: srlv $3, $4, $2 -; MMR6-NEXT: lw16 $16, 0($5) +; MMR6-NEXT: srl16 $4, $3, 3 +; MMR6-NEXT: andi $4, $4, 12 +; MMR6-NEXT: subu16 $2, $2, $4 +; MMR6-NEXT: lw16 $4, 4($2) +; MMR6-NEXT: lw16 $5, 8($2) +; MMR6-NEXT: andi16 $6, $3, 31 +; MMR6-NEXT: srlv $1, $5, $6 +; MMR6-NEXT: sll16 $3, $4, 1 +; MMR6-NEXT: xori $7, $6, 31 +; MMR6-NEXT: sllv $8, $3, $7 +; MMR6-NEXT: srlv $3, $4, $6 +; MMR6-NEXT: lw16 $16, 0($2) ; MMR6-NEXT: sll16 $4, $16, 1 -; MMR6-NEXT: xori $8, $2, 31 -; MMR6-NEXT: sllv $4, $4, $8 +; MMR6-NEXT: sllv $4, $4, $7 ; MMR6-NEXT: or $3, $3, $4 -; MMR6-NEXT: or $4, $1, $6 -; MMR6-NEXT: lw16 $5, 12($5) -; MMR6-NEXT: srlv $1, $5, $2 -; MMR6-NEXT: sll16 $5, $7, 1 -; MMR6-NEXT: sllv $5, $5, $8 -; MMR6-NEXT: or $5, $1, $5 -; MMR6-NEXT: srav $2, $16, $2 +; MMR6-NEXT: or $4, $1, $8 +; MMR6-NEXT: lw16 $2, 12($2) +; MMR6-NEXT: srlv $1, $2, $6 +; MMR6-NEXT: sll16 $2, $5, 1 +; MMR6-NEXT: sllv $2, $2, $7 +; MMR6-NEXT: or $5, $1, $2 +; MMR6-NEXT: srav $2, $16, $6 ; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra diff --git a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll index 03cf104e3120..69b842c73db1 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/lshr.ll @@ -398,52 +398,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MIPS2: # %bb.0: # %entry ; MIPS2-NEXT: addiu $sp, $sp, -32 ; MIPS2-NEXT: .cfi_def_cfa_offset 32 -; MIPS2-NEXT: swl $7, 28($sp) -; MIPS2-NEXT: swl $6, 24($sp) -; MIPS2-NEXT: swl $5, 20($sp) -; MIPS2-NEXT: swl $4, 16($sp) -; MIPS2-NEXT: swl $zero, 12($sp) -; MIPS2-NEXT: swl $zero, 8($sp) -; MIPS2-NEXT: swl $zero, 4($sp) -; MIPS2-NEXT: swl $zero, 0($sp) ; MIPS2-NEXT: addiu $1, $sp, 0 -; MIPS2-NEXT: swr $7, 31($sp) -; MIPS2-NEXT: swr $6, 27($sp) -; 
MIPS2-NEXT: swr $5, 23($sp) -; MIPS2-NEXT: swr $4, 19($sp) -; MIPS2-NEXT: swr $zero, 15($sp) -; MIPS2-NEXT: swr $zero, 11($sp) -; MIPS2-NEXT: swr $zero, 7($sp) -; MIPS2-NEXT: swr $zero, 3($sp) +; MIPS2-NEXT: sw $7, 28($sp) +; MIPS2-NEXT: sw $6, 24($sp) +; MIPS2-NEXT: sw $5, 20($sp) +; MIPS2-NEXT: sw $4, 16($sp) ; MIPS2-NEXT: addiu $1, $1, 16 ; MIPS2-NEXT: lw $2, 60($sp) ; MIPS2-NEXT: srl $3, $2, 3 -; MIPS2-NEXT: andi $3, $3, 15 +; MIPS2-NEXT: andi $3, $3, 12 ; MIPS2-NEXT: subu $1, $1, $3 -; MIPS2-NEXT: lwl $3, 4($1) -; MIPS2-NEXT: lwr $3, 7($1) -; MIPS2-NEXT: sll $4, $3, 1 -; MIPS2-NEXT: lwl $5, 8($1) -; MIPS2-NEXT: lwr $5, 11($1) -; MIPS2-NEXT: andi $2, $2, 7 -; MIPS2-NEXT: not $6, $2 -; MIPS2-NEXT: srlv $7, $5, $2 -; MIPS2-NEXT: sllv $4, $4, $6 +; MIPS2-NEXT: sw $zero, 12($sp) +; MIPS2-NEXT: sw $zero, 8($sp) +; MIPS2-NEXT: sw $zero, 4($sp) +; MIPS2-NEXT: sw $zero, 0($sp) +; MIPS2-NEXT: lw $3, 4($1) +; MIPS2-NEXT: lw $5, 8($1) +; MIPS2-NEXT: srlv $4, $5, $2 +; MIPS2-NEXT: sll $6, $3, 1 +; MIPS2-NEXT: andi $7, $2, 31 +; MIPS2-NEXT: xori $7, $7, 31 +; MIPS2-NEXT: sllv $6, $6, $7 ; MIPS2-NEXT: srlv $3, $3, $2 -; MIPS2-NEXT: lwl $6, 0($1) -; MIPS2-NEXT: lwr $6, 3($1) -; MIPS2-NEXT: sll $8, $6, 1 -; MIPS2-NEXT: xori $9, $2, 31 -; MIPS2-NEXT: sllv $8, $8, $9 -; MIPS2-NEXT: or $3, $3, $8 -; MIPS2-NEXT: or $4, $7, $4 -; MIPS2-NEXT: lwl $7, 12($1) -; MIPS2-NEXT: lwr $7, 15($1) -; MIPS2-NEXT: srlv $1, $7, $2 +; MIPS2-NEXT: lw $8, 0($1) +; MIPS2-NEXT: sll $9, $8, 1 +; MIPS2-NEXT: sllv $9, $9, $7 +; MIPS2-NEXT: or $3, $3, $9 +; MIPS2-NEXT: or $4, $4, $6 +; MIPS2-NEXT: lw $1, 12($1) +; MIPS2-NEXT: srlv $1, $1, $2 ; MIPS2-NEXT: sll $5, $5, 1 -; MIPS2-NEXT: sllv $5, $5, $9 +; MIPS2-NEXT: sllv $5, $5, $7 ; MIPS2-NEXT: or $5, $1, $5 -; MIPS2-NEXT: srlv $2, $6, $2 +; MIPS2-NEXT: srlv $2, $8, $2 ; MIPS2-NEXT: jr $ra ; MIPS2-NEXT: addiu $sp, $sp, 32 ; @@ -451,52 +438,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MIPS32: # %bb.0: # %entry ; MIPS32-NEXT: 
addiu $sp, $sp, -32 ; MIPS32-NEXT: .cfi_def_cfa_offset 32 -; MIPS32-NEXT: swl $7, 28($sp) -; MIPS32-NEXT: swl $6, 24($sp) -; MIPS32-NEXT: swl $5, 20($sp) -; MIPS32-NEXT: swl $4, 16($sp) -; MIPS32-NEXT: swl $zero, 12($sp) -; MIPS32-NEXT: swl $zero, 8($sp) -; MIPS32-NEXT: swl $zero, 4($sp) -; MIPS32-NEXT: swl $zero, 0($sp) ; MIPS32-NEXT: addiu $1, $sp, 0 -; MIPS32-NEXT: swr $7, 31($sp) -; MIPS32-NEXT: swr $6, 27($sp) -; MIPS32-NEXT: swr $5, 23($sp) -; MIPS32-NEXT: swr $4, 19($sp) -; MIPS32-NEXT: swr $zero, 15($sp) -; MIPS32-NEXT: swr $zero, 11($sp) -; MIPS32-NEXT: swr $zero, 7($sp) -; MIPS32-NEXT: swr $zero, 3($sp) +; MIPS32-NEXT: sw $7, 28($sp) +; MIPS32-NEXT: sw $6, 24($sp) +; MIPS32-NEXT: sw $5, 20($sp) +; MIPS32-NEXT: sw $4, 16($sp) ; MIPS32-NEXT: addiu $1, $1, 16 ; MIPS32-NEXT: lw $2, 60($sp) ; MIPS32-NEXT: srl $3, $2, 3 -; MIPS32-NEXT: andi $3, $3, 15 +; MIPS32-NEXT: andi $3, $3, 12 ; MIPS32-NEXT: subu $1, $1, $3 -; MIPS32-NEXT: lwl $3, 4($1) -; MIPS32-NEXT: lwr $3, 7($1) -; MIPS32-NEXT: sll $4, $3, 1 -; MIPS32-NEXT: lwl $5, 8($1) -; MIPS32-NEXT: lwr $5, 11($1) -; MIPS32-NEXT: andi $2, $2, 7 -; MIPS32-NEXT: not $6, $2 -; MIPS32-NEXT: srlv $7, $5, $2 -; MIPS32-NEXT: sllv $4, $4, $6 +; MIPS32-NEXT: sw $zero, 12($sp) +; MIPS32-NEXT: sw $zero, 8($sp) +; MIPS32-NEXT: sw $zero, 4($sp) +; MIPS32-NEXT: sw $zero, 0($sp) +; MIPS32-NEXT: lw $3, 4($1) +; MIPS32-NEXT: lw $5, 8($1) +; MIPS32-NEXT: srlv $4, $5, $2 +; MIPS32-NEXT: sll $6, $3, 1 +; MIPS32-NEXT: andi $7, $2, 31 +; MIPS32-NEXT: xori $7, $7, 31 +; MIPS32-NEXT: sllv $6, $6, $7 ; MIPS32-NEXT: srlv $3, $3, $2 -; MIPS32-NEXT: lwl $6, 0($1) -; MIPS32-NEXT: lwr $6, 3($1) -; MIPS32-NEXT: sll $8, $6, 1 -; MIPS32-NEXT: xori $9, $2, 31 -; MIPS32-NEXT: sllv $8, $8, $9 -; MIPS32-NEXT: or $3, $3, $8 -; MIPS32-NEXT: or $4, $7, $4 -; MIPS32-NEXT: lwl $7, 12($1) -; MIPS32-NEXT: lwr $7, 15($1) -; MIPS32-NEXT: srlv $1, $7, $2 +; MIPS32-NEXT: lw $8, 0($1) +; MIPS32-NEXT: sll $9, $8, 1 +; MIPS32-NEXT: sllv $9, $9, $7 +; MIPS32-NEXT: 
or $3, $3, $9 +; MIPS32-NEXT: or $4, $4, $6 +; MIPS32-NEXT: lw $1, 12($1) +; MIPS32-NEXT: srlv $1, $1, $2 ; MIPS32-NEXT: sll $5, $5, 1 -; MIPS32-NEXT: sllv $5, $5, $9 +; MIPS32-NEXT: sllv $5, $5, $7 ; MIPS32-NEXT: or $5, $1, $5 -; MIPS32-NEXT: srlv $2, $6, $2 +; MIPS32-NEXT: srlv $2, $8, $2 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: addiu $sp, $sp, 32 ; @@ -504,51 +478,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MIPS32R2: # %bb.0: # %entry ; MIPS32R2-NEXT: addiu $sp, $sp, -32 ; MIPS32R2-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R2-NEXT: swl $7, 28($sp) -; MIPS32R2-NEXT: swl $6, 24($sp) -; MIPS32R2-NEXT: swl $5, 20($sp) -; MIPS32R2-NEXT: swl $4, 16($sp) -; MIPS32R2-NEXT: swl $zero, 12($sp) -; MIPS32R2-NEXT: swl $zero, 8($sp) -; MIPS32R2-NEXT: swl $zero, 4($sp) -; MIPS32R2-NEXT: swl $zero, 0($sp) -; MIPS32R2-NEXT: swr $7, 31($sp) -; MIPS32R2-NEXT: swr $6, 27($sp) -; MIPS32R2-NEXT: swr $5, 23($sp) -; MIPS32R2-NEXT: swr $4, 19($sp) -; MIPS32R2-NEXT: swr $zero, 15($sp) -; MIPS32R2-NEXT: swr $zero, 11($sp) -; MIPS32R2-NEXT: swr $zero, 7($sp) -; MIPS32R2-NEXT: swr $zero, 3($sp) ; MIPS32R2-NEXT: addiu $1, $sp, 0 +; MIPS32R2-NEXT: sw $7, 28($sp) +; MIPS32R2-NEXT: sw $6, 24($sp) +; MIPS32R2-NEXT: sw $5, 20($sp) +; MIPS32R2-NEXT: sw $4, 16($sp) ; MIPS32R2-NEXT: addiu $1, $1, 16 ; MIPS32R2-NEXT: lw $2, 60($sp) -; MIPS32R2-NEXT: ext $3, $2, 3, 4 +; MIPS32R2-NEXT: srl $3, $2, 3 +; MIPS32R2-NEXT: andi $3, $3, 12 ; MIPS32R2-NEXT: subu $1, $1, $3 -; MIPS32R2-NEXT: lwl $3, 4($1) -; MIPS32R2-NEXT: lwr $3, 7($1) -; MIPS32R2-NEXT: sll $4, $3, 1 -; MIPS32R2-NEXT: lwl $5, 8($1) -; MIPS32R2-NEXT: lwr $5, 11($1) -; MIPS32R2-NEXT: andi $2, $2, 7 -; MIPS32R2-NEXT: not $6, $2 -; MIPS32R2-NEXT: srlv $7, $5, $2 -; MIPS32R2-NEXT: sllv $4, $4, $6 +; MIPS32R2-NEXT: sw $zero, 12($sp) +; MIPS32R2-NEXT: sw $zero, 8($sp) +; MIPS32R2-NEXT: sw $zero, 4($sp) +; MIPS32R2-NEXT: sw $zero, 0($sp) +; MIPS32R2-NEXT: lw $3, 4($1) +; MIPS32R2-NEXT: lw $5, 8($1) +; MIPS32R2-NEXT: srlv $4, $5, 
$2 +; MIPS32R2-NEXT: sll $6, $3, 1 +; MIPS32R2-NEXT: andi $7, $2, 31 +; MIPS32R2-NEXT: xori $7, $7, 31 +; MIPS32R2-NEXT: sllv $6, $6, $7 ; MIPS32R2-NEXT: srlv $3, $3, $2 -; MIPS32R2-NEXT: lwl $6, 0($1) -; MIPS32R2-NEXT: lwr $6, 3($1) -; MIPS32R2-NEXT: sll $8, $6, 1 -; MIPS32R2-NEXT: xori $9, $2, 31 -; MIPS32R2-NEXT: sllv $8, $8, $9 -; MIPS32R2-NEXT: or $3, $3, $8 -; MIPS32R2-NEXT: or $4, $7, $4 -; MIPS32R2-NEXT: lwl $7, 12($1) -; MIPS32R2-NEXT: lwr $7, 15($1) -; MIPS32R2-NEXT: srlv $1, $7, $2 +; MIPS32R2-NEXT: lw $8, 0($1) +; MIPS32R2-NEXT: sll $9, $8, 1 +; MIPS32R2-NEXT: sllv $9, $9, $7 +; MIPS32R2-NEXT: or $3, $3, $9 +; MIPS32R2-NEXT: or $4, $4, $6 +; MIPS32R2-NEXT: lw $1, 12($1) +; MIPS32R2-NEXT: srlv $1, $1, $2 ; MIPS32R2-NEXT: sll $5, $5, 1 -; MIPS32R2-NEXT: sllv $5, $5, $9 +; MIPS32R2-NEXT: sllv $5, $5, $7 ; MIPS32R2-NEXT: or $5, $1, $5 -; MIPS32R2-NEXT: srlv $2, $6, $2 +; MIPS32R2-NEXT: srlv $2, $8, $2 ; MIPS32R2-NEXT: jr $ra ; MIPS32R2-NEXT: addiu $sp, $sp, 32 ; @@ -563,32 +525,32 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MIPS32R6-NEXT: sw $4, 16($sp) ; MIPS32R6-NEXT: addiu $1, $1, 16 ; MIPS32R6-NEXT: lw $2, 60($sp) -; MIPS32R6-NEXT: ext $3, $2, 3, 4 +; MIPS32R6-NEXT: srl $3, $2, 3 +; MIPS32R6-NEXT: andi $3, $3, 12 ; MIPS32R6-NEXT: subu $1, $1, $3 ; MIPS32R6-NEXT: sw $zero, 12($sp) ; MIPS32R6-NEXT: sw $zero, 8($sp) ; MIPS32R6-NEXT: sw $zero, 4($sp) ; MIPS32R6-NEXT: sw $zero, 0($sp) ; MIPS32R6-NEXT: lw $3, 4($1) -; MIPS32R6-NEXT: sll $4, $3, 1 ; MIPS32R6-NEXT: lw $5, 8($1) -; MIPS32R6-NEXT: andi $2, $2, 7 -; MIPS32R6-NEXT: not $6, $2 -; MIPS32R6-NEXT: srlv $7, $5, $2 -; MIPS32R6-NEXT: sllv $4, $4, $6 +; MIPS32R6-NEXT: srlv $4, $5, $2 +; MIPS32R6-NEXT: sll $6, $3, 1 +; MIPS32R6-NEXT: andi $7, $2, 31 +; MIPS32R6-NEXT: xori $7, $7, 31 +; MIPS32R6-NEXT: sllv $6, $6, $7 ; MIPS32R6-NEXT: srlv $3, $3, $2 -; MIPS32R6-NEXT: lw $6, 0($1) -; MIPS32R6-NEXT: sll $8, $6, 1 -; MIPS32R6-NEXT: xori $9, $2, 31 -; MIPS32R6-NEXT: sllv $8, $8, $9 -; 
MIPS32R6-NEXT: or $3, $3, $8 -; MIPS32R6-NEXT: or $4, $7, $4 +; MIPS32R6-NEXT: lw $8, 0($1) +; MIPS32R6-NEXT: sll $9, $8, 1 +; MIPS32R6-NEXT: sllv $9, $9, $7 +; MIPS32R6-NEXT: or $3, $3, $9 +; MIPS32R6-NEXT: or $4, $4, $6 ; MIPS32R6-NEXT: lw $1, 12($1) ; MIPS32R6-NEXT: srlv $1, $1, $2 ; MIPS32R6-NEXT: sll $5, $5, 1 -; MIPS32R6-NEXT: sllv $5, $5, $9 +; MIPS32R6-NEXT: sllv $5, $5, $7 ; MIPS32R6-NEXT: or $5, $1, $5 -; MIPS32R6-NEXT: srlv $2, $6, $2 +; MIPS32R6-NEXT: srlv $2, $8, $2 ; MIPS32R6-NEXT: jr $ra ; MIPS32R6-NEXT: addiu $sp, $sp, 32 ; @@ -677,53 +639,37 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: swp $16, 32($sp) ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 -; MMR3-NEXT: swl $7, 28($sp) -; MMR3-NEXT: swl $6, 24($sp) -; MMR3-NEXT: swl $5, 20($sp) ; MMR3-NEXT: li16 $2, 0 -; MMR3-NEXT: swl $4, 16($sp) -; MMR3-NEXT: swl $2, 12($sp) -; MMR3-NEXT: swl $2, 8($sp) -; MMR3-NEXT: swl $2, 4($sp) -; MMR3-NEXT: swl $2, 0($sp) -; MMR3-NEXT: swr $7, 31($sp) -; MMR3-NEXT: swr $6, 27($sp) -; MMR3-NEXT: swr $5, 23($sp) -; MMR3-NEXT: swr $4, 19($sp) -; MMR3-NEXT: swr $2, 15($sp) -; MMR3-NEXT: swr $2, 11($sp) -; MMR3-NEXT: swr $2, 7($sp) -; MMR3-NEXT: swr $2, 3($sp) +; MMR3-NEXT: swp $6, 24($sp) +; MMR3-NEXT: swp $4, 16($sp) +; MMR3-NEXT: sw $2, 12($sp) +; MMR3-NEXT: sw $2, 8($sp) +; MMR3-NEXT: sw $2, 4($sp) +; MMR3-NEXT: sw $2, 0($sp) ; MMR3-NEXT: addiur1sp $2, 0 ; MMR3-NEXT: addiur2 $2, $2, 16 ; MMR3-NEXT: lw $3, 68($sp) -; MMR3-NEXT: ext $4, $3, 3, 4 -; MMR3-NEXT: subu16 $2, $2, $4 -; MMR3-NEXT: lwl $7, 4($2) -; MMR3-NEXT: lwr $7, 7($2) -; MMR3-NEXT: sll16 $4, $7, 1 -; MMR3-NEXT: lwl $5, 8($2) -; MMR3-NEXT: lwr $5, 11($2) -; MMR3-NEXT: andi16 $6, $3, 7 -; MMR3-NEXT: not16 $3, $6 -; MMR3-NEXT: andi16 $3, $3, 31 -; MMR3-NEXT: srlv $16, $5, $6 -; MMR3-NEXT: sllv $4, $4, $3 -; MMR3-NEXT: srlv $17, $7, $6 -; MMR3-NEXT: lwl $7, 0($2) -; MMR3-NEXT: lwr $7, 3($2) -; MMR3-NEXT: sll16 $3, $7, 1 -; MMR3-NEXT: xori $1, $6, 31 +; 
MMR3-NEXT: srl16 $4, $3, 3 +; MMR3-NEXT: andi $4, $4, 12 +; MMR3-NEXT: subu16 $5, $2, $4 +; MMR3-NEXT: lwp $6, 4($5) +; MMR3-NEXT: andi16 $2, $3, 31 +; MMR3-NEXT: srlv $16, $7, $2 +; MMR3-NEXT: sll16 $3, $6, 1 +; MMR3-NEXT: xori $1, $2, 31 +; MMR3-NEXT: sllv $4, $3, $1 +; MMR3-NEXT: srlv $6, $6, $2 +; MMR3-NEXT: lw16 $17, 0($5) +; MMR3-NEXT: sll16 $3, $17, 1 ; MMR3-NEXT: sllv $3, $3, $1 -; MMR3-NEXT: or16 $3, $17 +; MMR3-NEXT: or16 $3, $6 ; MMR3-NEXT: or16 $4, $16 -; MMR3-NEXT: lwl $8, 12($2) -; MMR3-NEXT: lwr $8, 15($2) -; MMR3-NEXT: srlv $2, $8, $6 -; MMR3-NEXT: sll16 $5, $5, 1 +; MMR3-NEXT: lw16 $5, 12($5) +; MMR3-NEXT: srlv $6, $5, $2 +; MMR3-NEXT: sll16 $5, $7, 1 ; MMR3-NEXT: sllv $5, $5, $1 -; MMR3-NEXT: or16 $5, $2 -; MMR3-NEXT: srlv $2, $7, $6 +; MMR3-NEXT: or16 $5, $6 +; MMR3-NEXT: srlv $2, $17, $2 ; MMR3-NEXT: lwp $16, 32($sp) ; MMR3-NEXT: addiusp 40 ; MMR3-NEXT: jrc $ra @@ -735,40 +681,39 @@ define signext i128 @lshr_i128(i128 signext %a, i128 signext %b) { ; MMR6-NEXT: sw $16, 36($sp) # 4-byte Folded Spill ; MMR6-NEXT: .cfi_offset 16, -4 ; MMR6-NEXT: li16 $2, 0 -; MMR6-NEXT: sw $7, 32($sp) -; MMR6-NEXT: sw $6, 28($sp) -; MMR6-NEXT: sw $5, 24($sp) -; MMR6-NEXT: sw $4, 20($sp) -; MMR6-NEXT: sw $2, 16($sp) +; MMR6-NEXT: sw $7, 28($sp) +; MMR6-NEXT: sw $6, 24($sp) +; MMR6-NEXT: sw $5, 20($sp) +; MMR6-NEXT: sw $4, 16($sp) ; MMR6-NEXT: sw $2, 12($sp) ; MMR6-NEXT: sw $2, 8($sp) ; MMR6-NEXT: sw $2, 4($sp) -; MMR6-NEXT: addiu $2, $sp, 4 +; MMR6-NEXT: sw $2, 0($sp) +; MMR6-NEXT: addiu $2, $sp, 0 ; MMR6-NEXT: addiur2 $2, $2, 16 ; MMR6-NEXT: lw $3, 68($sp) -; MMR6-NEXT: ext $4, $3, 3, 4 -; MMR6-NEXT: subu16 $5, $2, $4 -; MMR6-NEXT: lw16 $4, 4($5) -; MMR6-NEXT: sll16 $6, $4, 1 -; MMR6-NEXT: lw16 $7, 8($5) -; MMR6-NEXT: andi16 $2, $3, 7 -; MMR6-NEXT: not16 $3, $2 -; MMR6-NEXT: andi16 $3, $3, 31 -; MMR6-NEXT: srlv $1, $7, $2 -; MMR6-NEXT: sllv $6, $6, $3 -; MMR6-NEXT: srlv $3, $4, $2 -; MMR6-NEXT: lw16 $16, 0($5) +; MMR6-NEXT: srl16 $4, $3, 3 +; MMR6-NEXT: andi $4, 
$4, 12 +; MMR6-NEXT: subu16 $2, $2, $4 +; MMR6-NEXT: lw16 $4, 4($2) +; MMR6-NEXT: lw16 $5, 8($2) +; MMR6-NEXT: andi16 $6, $3, 31 +; MMR6-NEXT: srlv $1, $5, $6 +; MMR6-NEXT: sll16 $3, $4, 1 +; MMR6-NEXT: xori $7, $6, 31 +; MMR6-NEXT: sllv $8, $3, $7 +; MMR6-NEXT: srlv $3, $4, $6 +; MMR6-NEXT: lw16 $16, 0($2) ; MMR6-NEXT: sll16 $4, $16, 1 -; MMR6-NEXT: xori $8, $2, 31 -; MMR6-NEXT: sllv $4, $4, $8 +; MMR6-NEXT: sllv $4, $4, $7 ; MMR6-NEXT: or $3, $3, $4 -; MMR6-NEXT: or $4, $1, $6 -; MMR6-NEXT: lw16 $5, 12($5) -; MMR6-NEXT: srlv $1, $5, $2 -; MMR6-NEXT: sll16 $5, $7, 1 -; MMR6-NEXT: sllv $5, $5, $8 -; MMR6-NEXT: or $5, $1, $5 -; MMR6-NEXT: srlv $2, $16, $2 +; MMR6-NEXT: or $4, $1, $8 +; MMR6-NEXT: lw16 $2, 12($2) +; MMR6-NEXT: srlv $1, $2, $6 +; MMR6-NEXT: sll16 $2, $5, 1 +; MMR6-NEXT: sllv $2, $2, $7 +; MMR6-NEXT: or $5, $1, $2 +; MMR6-NEXT: srlv $2, $16, $6 ; MMR6-NEXT: lw $16, 36($sp) # 4-byte Folded Reload ; MMR6-NEXT: addiu $sp, $sp, 40 ; MMR6-NEXT: jrc $ra diff --git a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll index af3d4f50f3fe..8d548861f439 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/sdiv.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s 
-check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: sdiv_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: move $2, $4 +; ; GP32-LABEL: sdiv_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: move $2, $4 ; +; MIPS3-LABEL: sdiv_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: move $2, $4 +; ; GP64-LABEL: sdiv_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,15 @@ entry: } define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) { +; MIPS2-LABEL: sdiv_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: sll $1, $1, 24 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 24 +; ; GP32R0R2-LABEL: sdiv_i8: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: div $zero, $4, $5 @@ -94,6 +113,15 @@ define signext i8 @sdiv_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seb $2, $1 ; +; MIPS3-LABEL: sdiv_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: sll $1, $1, 24 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 24 +; ; GP64R0R1-LABEL: sdiv_i8: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: div $zero, $4, $5 @@ -138,6 +166,15 @@ entry: } define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) { +; 
MIPS2-LABEL: sdiv_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: sll $1, $1, 16 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 16 +; ; GP32R0R2-LABEL: sdiv_i16: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: div $zero, $4, $5 @@ -162,6 +199,15 @@ define signext i16 @sdiv_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seh $2, $1 ; +; MIPS3-LABEL: sdiv_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: sll $1, $1, 16 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 16 +; ; GP64R0R1-LABEL: sdiv_i16: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: div $zero, $4, $5 @@ -206,6 +252,14 @@ entry: } define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: sdiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: sdiv_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -219,6 +273,14 @@ define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: sdiv_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: sdiv_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -250,6 +312,22 @@ entry: } define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: sdiv_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, 
%call16(__divdi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: sdiv_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -281,6 +359,14 @@ define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: sdiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: sdiv_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddiv $zero, $4, $5 @@ -332,6 +418,30 @@ entry: } define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: sdiv_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__divti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: sdiv_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -379,6 +489,25 @@ define signext i128 @sdiv_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: sdiv_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte 
Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(sdiv_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(sdiv_i128))) +; MIPS3-NEXT: ld $25, %call16(__divti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: sdiv_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll index 81f089a52947..394890a9dcc7 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/shl.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/shl.ll @@ -440,49 +440,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS2: # %bb.0: # %entry ; MIPS2-NEXT: addiu $sp, $sp, -32 ; MIPS2-NEXT: .cfi_def_cfa_offset 32 -; MIPS2-NEXT: swl $zero, 28($sp) -; MIPS2-NEXT: swl $zero, 24($sp) -; MIPS2-NEXT: swl $zero, 20($sp) -; MIPS2-NEXT: swl $zero, 16($sp) -; MIPS2-NEXT: swl $7, 12($sp) -; MIPS2-NEXT: swl $6, 8($sp) -; MIPS2-NEXT: swl $5, 4($sp) -; MIPS2-NEXT: swl $4, 0($sp) -; MIPS2-NEXT: swr $zero, 31($sp) -; MIPS2-NEXT: swr $zero, 27($sp) -; MIPS2-NEXT: swr $zero, 23($sp) -; MIPS2-NEXT: swr $zero, 19($sp) -; MIPS2-NEXT: swr $7, 15($sp) -; MIPS2-NEXT: swr $6, 11($sp) -; MIPS2-NEXT: swr $5, 7($sp) -; MIPS2-NEXT: swr $4, 3($sp) ; MIPS2-NEXT: lw $1, 60($sp) ; MIPS2-NEXT: srl $2, $1, 3 -; MIPS2-NEXT: andi $2, $2, 15 +; MIPS2-NEXT: sw $7, 12($sp) +; MIPS2-NEXT: sw $6, 8($sp) +; MIPS2-NEXT: sw $5, 4($sp) +; MIPS2-NEXT: sw $4, 0($sp) +; MIPS2-NEXT: andi $2, $2, 12 ; MIPS2-NEXT: addiu $3, $sp, 0 ; MIPS2-NEXT: addu $4, $3, $2 -; MIPS2-NEXT: lwl $5, 8($4) -; MIPS2-NEXT: lwr $5, 11($4) -; MIPS2-NEXT: srl $2, $5, 1 -; MIPS2-NEXT: lwl $3, 4($4) -; MIPS2-NEXT: lwr $3, 7($4) -; MIPS2-NEXT: andi $1, $1, 7 -; MIPS2-NEXT: not $6, $1 -; 
MIPS2-NEXT: sllv $7, $3, $1 -; MIPS2-NEXT: srlv $6, $2, $6 -; MIPS2-NEXT: lwl $2, 0($4) -; MIPS2-NEXT: lwr $2, 3($4) -; MIPS2-NEXT: sllv $2, $2, $1 -; MIPS2-NEXT: srl $3, $3, 1 -; MIPS2-NEXT: xori $8, $1, 31 -; MIPS2-NEXT: srlv $3, $3, $8 -; MIPS2-NEXT: or $2, $2, $3 -; MIPS2-NEXT: or $3, $7, $6 +; MIPS2-NEXT: sw $zero, 28($sp) +; MIPS2-NEXT: sw $zero, 24($sp) +; MIPS2-NEXT: sw $zero, 20($sp) +; MIPS2-NEXT: sw $zero, 16($sp) +; MIPS2-NEXT: lw $5, 8($4) +; MIPS2-NEXT: lw $2, 4($4) +; MIPS2-NEXT: sllv $3, $2, $1 +; MIPS2-NEXT: srl $6, $5, 1 +; MIPS2-NEXT: andi $7, $1, 31 +; MIPS2-NEXT: xori $7, $7, 31 +; MIPS2-NEXT: srlv $6, $6, $7 +; MIPS2-NEXT: lw $8, 0($4) +; MIPS2-NEXT: sllv $8, $8, $1 +; MIPS2-NEXT: srl $2, $2, 1 +; MIPS2-NEXT: srlv $2, $2, $7 +; MIPS2-NEXT: or $2, $8, $2 +; MIPS2-NEXT: or $3, $3, $6 ; MIPS2-NEXT: sllv $5, $5, $1 -; MIPS2-NEXT: lwl $6, 12($4) -; MIPS2-NEXT: lwr $6, 15($4) +; MIPS2-NEXT: lw $6, 12($4) ; MIPS2-NEXT: srl $4, $6, 1 -; MIPS2-NEXT: srlv $4, $4, $8 +; MIPS2-NEXT: srlv $4, $4, $7 ; MIPS2-NEXT: or $4, $5, $4 ; MIPS2-NEXT: sllv $5, $6, $1 ; MIPS2-NEXT: jr $ra @@ -492,49 +479,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS32: # %bb.0: # %entry ; MIPS32-NEXT: addiu $sp, $sp, -32 ; MIPS32-NEXT: .cfi_def_cfa_offset 32 -; MIPS32-NEXT: swl $zero, 28($sp) -; MIPS32-NEXT: swl $zero, 24($sp) -; MIPS32-NEXT: swl $zero, 20($sp) -; MIPS32-NEXT: swl $zero, 16($sp) -; MIPS32-NEXT: swl $7, 12($sp) -; MIPS32-NEXT: swl $6, 8($sp) -; MIPS32-NEXT: swl $5, 4($sp) -; MIPS32-NEXT: swl $4, 0($sp) -; MIPS32-NEXT: swr $zero, 31($sp) -; MIPS32-NEXT: swr $zero, 27($sp) -; MIPS32-NEXT: swr $zero, 23($sp) -; MIPS32-NEXT: swr $zero, 19($sp) -; MIPS32-NEXT: swr $7, 15($sp) -; MIPS32-NEXT: swr $6, 11($sp) -; MIPS32-NEXT: swr $5, 7($sp) -; MIPS32-NEXT: swr $4, 3($sp) ; MIPS32-NEXT: lw $1, 60($sp) ; MIPS32-NEXT: srl $2, $1, 3 -; MIPS32-NEXT: andi $2, $2, 15 +; MIPS32-NEXT: sw $7, 12($sp) +; MIPS32-NEXT: sw $6, 8($sp) +; MIPS32-NEXT: sw $5, 
4($sp) +; MIPS32-NEXT: sw $4, 0($sp) +; MIPS32-NEXT: andi $2, $2, 12 ; MIPS32-NEXT: addiu $3, $sp, 0 ; MIPS32-NEXT: addu $4, $3, $2 -; MIPS32-NEXT: lwl $5, 8($4) -; MIPS32-NEXT: lwr $5, 11($4) -; MIPS32-NEXT: srl $2, $5, 1 -; MIPS32-NEXT: lwl $3, 4($4) -; MIPS32-NEXT: lwr $3, 7($4) -; MIPS32-NEXT: andi $1, $1, 7 -; MIPS32-NEXT: not $6, $1 -; MIPS32-NEXT: sllv $7, $3, $1 -; MIPS32-NEXT: srlv $6, $2, $6 -; MIPS32-NEXT: lwl $2, 0($4) -; MIPS32-NEXT: lwr $2, 3($4) -; MIPS32-NEXT: sllv $2, $2, $1 -; MIPS32-NEXT: srl $3, $3, 1 -; MIPS32-NEXT: xori $8, $1, 31 -; MIPS32-NEXT: srlv $3, $3, $8 -; MIPS32-NEXT: or $2, $2, $3 -; MIPS32-NEXT: or $3, $7, $6 +; MIPS32-NEXT: sw $zero, 28($sp) +; MIPS32-NEXT: sw $zero, 24($sp) +; MIPS32-NEXT: sw $zero, 20($sp) +; MIPS32-NEXT: sw $zero, 16($sp) +; MIPS32-NEXT: lw $5, 8($4) +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: sllv $3, $2, $1 +; MIPS32-NEXT: srl $6, $5, 1 +; MIPS32-NEXT: andi $7, $1, 31 +; MIPS32-NEXT: xori $7, $7, 31 +; MIPS32-NEXT: srlv $6, $6, $7 +; MIPS32-NEXT: lw $8, 0($4) +; MIPS32-NEXT: sllv $8, $8, $1 +; MIPS32-NEXT: srl $2, $2, 1 +; MIPS32-NEXT: srlv $2, $2, $7 +; MIPS32-NEXT: or $2, $8, $2 +; MIPS32-NEXT: or $3, $3, $6 ; MIPS32-NEXT: sllv $5, $5, $1 -; MIPS32-NEXT: lwl $6, 12($4) -; MIPS32-NEXT: lwr $6, 15($4) +; MIPS32-NEXT: lw $6, 12($4) ; MIPS32-NEXT: srl $4, $6, 1 -; MIPS32-NEXT: srlv $4, $4, $8 +; MIPS32-NEXT: srlv $4, $4, $7 ; MIPS32-NEXT: or $4, $5, $4 ; MIPS32-NEXT: sllv $5, $6, $1 ; MIPS32-NEXT: jr $ra @@ -544,48 +518,36 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS32R2: # %bb.0: # %entry ; MIPS32R2-NEXT: addiu $sp, $sp, -32 ; MIPS32R2-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R2-NEXT: swl $zero, 28($sp) -; MIPS32R2-NEXT: swl $zero, 24($sp) -; MIPS32R2-NEXT: swl $zero, 20($sp) -; MIPS32R2-NEXT: swl $zero, 16($sp) -; MIPS32R2-NEXT: swl $7, 12($sp) -; MIPS32R2-NEXT: swl $6, 8($sp) -; MIPS32R2-NEXT: swl $5, 4($sp) -; MIPS32R2-NEXT: swl $4, 0($sp) -; MIPS32R2-NEXT: swr $zero, 31($sp) -; 
MIPS32R2-NEXT: swr $zero, 27($sp) -; MIPS32R2-NEXT: swr $zero, 23($sp) -; MIPS32R2-NEXT: swr $zero, 19($sp) -; MIPS32R2-NEXT: swr $7, 15($sp) -; MIPS32R2-NEXT: swr $6, 11($sp) -; MIPS32R2-NEXT: swr $5, 7($sp) -; MIPS32R2-NEXT: swr $4, 3($sp) ; MIPS32R2-NEXT: lw $1, 60($sp) -; MIPS32R2-NEXT: ext $2, $1, 3, 4 +; MIPS32R2-NEXT: srl $2, $1, 3 +; MIPS32R2-NEXT: sw $7, 12($sp) +; MIPS32R2-NEXT: sw $6, 8($sp) +; MIPS32R2-NEXT: sw $5, 4($sp) +; MIPS32R2-NEXT: sw $4, 0($sp) +; MIPS32R2-NEXT: andi $2, $2, 12 ; MIPS32R2-NEXT: addiu $3, $sp, 0 ; MIPS32R2-NEXT: addu $4, $3, $2 -; MIPS32R2-NEXT: lwl $5, 8($4) -; MIPS32R2-NEXT: lwr $5, 11($4) -; MIPS32R2-NEXT: srl $2, $5, 1 -; MIPS32R2-NEXT: lwl $3, 4($4) -; MIPS32R2-NEXT: lwr $3, 7($4) -; MIPS32R2-NEXT: andi $1, $1, 7 -; MIPS32R2-NEXT: not $6, $1 -; MIPS32R2-NEXT: sllv $7, $3, $1 -; MIPS32R2-NEXT: srlv $6, $2, $6 -; MIPS32R2-NEXT: lwl $2, 0($4) -; MIPS32R2-NEXT: lwr $2, 3($4) -; MIPS32R2-NEXT: sllv $2, $2, $1 -; MIPS32R2-NEXT: srl $3, $3, 1 -; MIPS32R2-NEXT: xori $8, $1, 31 -; MIPS32R2-NEXT: srlv $3, $3, $8 -; MIPS32R2-NEXT: or $2, $2, $3 -; MIPS32R2-NEXT: or $3, $7, $6 +; MIPS32R2-NEXT: sw $zero, 28($sp) +; MIPS32R2-NEXT: sw $zero, 24($sp) +; MIPS32R2-NEXT: sw $zero, 20($sp) +; MIPS32R2-NEXT: sw $zero, 16($sp) +; MIPS32R2-NEXT: lw $5, 8($4) +; MIPS32R2-NEXT: lw $2, 4($4) +; MIPS32R2-NEXT: sllv $3, $2, $1 +; MIPS32R2-NEXT: srl $6, $5, 1 +; MIPS32R2-NEXT: andi $7, $1, 31 +; MIPS32R2-NEXT: xori $7, $7, 31 +; MIPS32R2-NEXT: srlv $6, $6, $7 +; MIPS32R2-NEXT: lw $8, 0($4) +; MIPS32R2-NEXT: sllv $8, $8, $1 +; MIPS32R2-NEXT: srl $2, $2, 1 +; MIPS32R2-NEXT: srlv $2, $2, $7 +; MIPS32R2-NEXT: or $2, $8, $2 +; MIPS32R2-NEXT: or $3, $3, $6 ; MIPS32R2-NEXT: sllv $5, $5, $1 -; MIPS32R2-NEXT: lwl $6, 12($4) -; MIPS32R2-NEXT: lwr $6, 15($4) +; MIPS32R2-NEXT: lw $6, 12($4) ; MIPS32R2-NEXT: srl $4, $6, 1 -; MIPS32R2-NEXT: srlv $4, $4, $8 +; MIPS32R2-NEXT: srlv $4, $4, $7 ; MIPS32R2-NEXT: or $4, $5, $4 ; MIPS32R2-NEXT: sllv $5, $6, $1 ; 
MIPS32R2-NEXT: jr $ra @@ -596,11 +558,12 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS32R6-NEXT: addiu $sp, $sp, -32 ; MIPS32R6-NEXT: .cfi_def_cfa_offset 32 ; MIPS32R6-NEXT: lw $1, 60($sp) +; MIPS32R6-NEXT: srl $2, $1, 3 ; MIPS32R6-NEXT: sw $7, 12($sp) ; MIPS32R6-NEXT: sw $6, 8($sp) ; MIPS32R6-NEXT: sw $5, 4($sp) ; MIPS32R6-NEXT: sw $4, 0($sp) -; MIPS32R6-NEXT: ext $2, $1, 3, 4 +; MIPS32R6-NEXT: andi $2, $2, 12 ; MIPS32R6-NEXT: addiu $3, $sp, 0 ; MIPS32R6-NEXT: addu $4, $3, $2 ; MIPS32R6-NEXT: sw $zero, 28($sp) @@ -608,23 +571,22 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MIPS32R6-NEXT: sw $zero, 20($sp) ; MIPS32R6-NEXT: sw $zero, 16($sp) ; MIPS32R6-NEXT: lw $5, 8($4) -; MIPS32R6-NEXT: srl $2, $5, 1 -; MIPS32R6-NEXT: lw $3, 4($4) -; MIPS32R6-NEXT: andi $1, $1, 7 -; MIPS32R6-NEXT: not $6, $1 -; MIPS32R6-NEXT: sllv $7, $3, $1 -; MIPS32R6-NEXT: srlv $6, $2, $6 -; MIPS32R6-NEXT: lw $2, 0($4) -; MIPS32R6-NEXT: sllv $2, $2, $1 -; MIPS32R6-NEXT: srl $3, $3, 1 -; MIPS32R6-NEXT: xori $8, $1, 31 -; MIPS32R6-NEXT: srlv $3, $3, $8 -; MIPS32R6-NEXT: or $2, $2, $3 -; MIPS32R6-NEXT: or $3, $7, $6 +; MIPS32R6-NEXT: lw $2, 4($4) +; MIPS32R6-NEXT: sllv $3, $2, $1 +; MIPS32R6-NEXT: srl $6, $5, 1 +; MIPS32R6-NEXT: andi $7, $1, 31 +; MIPS32R6-NEXT: xori $7, $7, 31 +; MIPS32R6-NEXT: srlv $6, $6, $7 +; MIPS32R6-NEXT: lw $8, 0($4) +; MIPS32R6-NEXT: sllv $8, $8, $1 +; MIPS32R6-NEXT: srl $2, $2, 1 +; MIPS32R6-NEXT: srlv $2, $2, $7 +; MIPS32R6-NEXT: or $2, $8, $2 +; MIPS32R6-NEXT: or $3, $3, $6 ; MIPS32R6-NEXT: sllv $5, $5, $1 ; MIPS32R6-NEXT: lw $6, 12($4) ; MIPS32R6-NEXT: srl $4, $6, 1 -; MIPS32R6-NEXT: srlv $4, $4, $8 +; MIPS32R6-NEXT: srlv $4, $4, $7 ; MIPS32R6-NEXT: or $4, $5, $4 ; MIPS32R6-NEXT: sllv $5, $6, $1 ; MIPS32R6-NEXT: jr $ra @@ -722,47 +684,32 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MMR3-NEXT: .cfi_offset 17, -4 ; MMR3-NEXT: .cfi_offset 16, -8 ; MMR3-NEXT: li16 $2, 0 -; MMR3-NEXT: swl $2, 
28($sp) -; MMR3-NEXT: swl $2, 24($sp) -; MMR3-NEXT: swl $2, 20($sp) -; MMR3-NEXT: swl $2, 16($sp) -; MMR3-NEXT: swl $7, 12($sp) -; MMR3-NEXT: swl $6, 8($sp) -; MMR3-NEXT: swl $5, 4($sp) -; MMR3-NEXT: swl $4, 0($sp) -; MMR3-NEXT: swr $2, 31($sp) -; MMR3-NEXT: swr $2, 27($sp) -; MMR3-NEXT: swr $2, 23($sp) -; MMR3-NEXT: swr $2, 19($sp) -; MMR3-NEXT: swr $7, 15($sp) -; MMR3-NEXT: swr $6, 11($sp) -; MMR3-NEXT: swr $5, 7($sp) -; MMR3-NEXT: swr $4, 3($sp) +; MMR3-NEXT: sw $2, 28($sp) +; MMR3-NEXT: sw $2, 24($sp) +; MMR3-NEXT: sw $2, 20($sp) +; MMR3-NEXT: sw $2, 16($sp) +; MMR3-NEXT: swp $6, 8($sp) +; MMR3-NEXT: swp $4, 0($sp) ; MMR3-NEXT: lw $2, 68($sp) -; MMR3-NEXT: ext $3, $2, 3, 4 +; MMR3-NEXT: srl16 $3, $2, 3 +; MMR3-NEXT: andi $3, $3, 12 ; MMR3-NEXT: addiur1sp $4, 0 ; MMR3-NEXT: addu16 $4, $4, $3 -; MMR3-NEXT: lwl $6, 8($4) -; MMR3-NEXT: lwr $6, 11($4) -; MMR3-NEXT: srl16 $3, $6, 1 -; MMR3-NEXT: lwl $7, 4($4) -; MMR3-NEXT: lwr $7, 7($4) -; MMR3-NEXT: andi16 $5, $2, 7 -; MMR3-NEXT: not16 $2, $5 -; MMR3-NEXT: andi16 $2, $2, 31 +; MMR3-NEXT: lw16 $6, 8($4) +; MMR3-NEXT: lw16 $7, 4($4) +; MMR3-NEXT: andi16 $5, $2, 31 ; MMR3-NEXT: sllv $16, $7, $5 -; MMR3-NEXT: srlv $3, $3, $2 -; MMR3-NEXT: lwl $1, 0($4) -; MMR3-NEXT: lwr $1, 3($4) -; MMR3-NEXT: sllv $17, $1, $5 -; MMR3-NEXT: srl16 $2, $7, 1 +; MMR3-NEXT: srl16 $2, $6, 1 ; MMR3-NEXT: xori $1, $5, 31 +; MMR3-NEXT: srlv $3, $2, $1 +; MMR3-NEXT: lw16 $2, 0($4) +; MMR3-NEXT: sllv $17, $2, $5 +; MMR3-NEXT: srl16 $2, $7, 1 ; MMR3-NEXT: srlv $2, $2, $1 ; MMR3-NEXT: or16 $2, $17 ; MMR3-NEXT: or16 $3, $16 ; MMR3-NEXT: sllv $6, $6, $5 -; MMR3-NEXT: lwl $7, 12($4) -; MMR3-NEXT: lwr $7, 15($4) +; MMR3-NEXT: lw16 $7, 12($4) ; MMR3-NEXT: srl16 $4, $7, 1 ; MMR3-NEXT: srlv $4, $4, $1 ; MMR3-NEXT: or16 $4, $6 @@ -785,30 +732,29 @@ define signext i128 @shl_i128(i128 signext %a, i128 signext %b) { ; MMR6-NEXT: sw $5, 4($sp) ; MMR6-NEXT: sw $4, 0($sp) ; MMR6-NEXT: lw $2, 60($sp) -; MMR6-NEXT: ext $3, $2, 3, 4 +; MMR6-NEXT: srl16 $3, $2, 3 +; 
MMR6-NEXT: andi $3, $3, 12 ; MMR6-NEXT: addiu $4, $sp, 0 ; MMR6-NEXT: addu16 $4, $4, $3 -; MMR6-NEXT: lw16 $6, 8($4) -; MMR6-NEXT: srl16 $3, $6, 1 -; MMR6-NEXT: lw16 $7, 4($4) -; MMR6-NEXT: andi16 $5, $2, 7 -; MMR6-NEXT: not16 $2, $5 -; MMR6-NEXT: andi16 $2, $2, 31 -; MMR6-NEXT: sllv $1, $7, $5 -; MMR6-NEXT: srlv $3, $3, $2 +; MMR6-NEXT: lw16 $5, 8($4) +; MMR6-NEXT: lw16 $3, 4($4) +; MMR6-NEXT: andi16 $6, $2, 31 +; MMR6-NEXT: sllv $1, $3, $6 +; MMR6-NEXT: srl16 $2, $5, 1 +; MMR6-NEXT: xori $7, $6, 31 +; MMR6-NEXT: srlv $8, $2, $7 ; MMR6-NEXT: lw16 $2, 0($4) -; MMR6-NEXT: sllv $2, $2, $5 -; MMR6-NEXT: srl16 $7, $7, 1 -; MMR6-NEXT: xori $8, $5, 31 -; MMR6-NEXT: srlv $7, $7, $8 -; MMR6-NEXT: or $2, $2, $7 -; MMR6-NEXT: or $3, $1, $3 -; MMR6-NEXT: sllv $1, $6, $5 -; MMR6-NEXT: lw16 $6, 12($4) -; MMR6-NEXT: srl16 $4, $6, 1 -; MMR6-NEXT: srlv $4, $4, $8 +; MMR6-NEXT: sllv $2, $2, $6 +; MMR6-NEXT: srl16 $3, $3, 1 +; MMR6-NEXT: srlv $3, $3, $7 +; MMR6-NEXT: or $2, $2, $3 +; MMR6-NEXT: or $3, $1, $8 +; MMR6-NEXT: sllv $1, $5, $6 +; MMR6-NEXT: lw16 $5, 12($4) +; MMR6-NEXT: srl16 $4, $5, 1 +; MMR6-NEXT: srlv $4, $4, $7 ; MMR6-NEXT: or $4, $1, $4 -; MMR6-NEXT: sllv $5, $6, $5 +; MMR6-NEXT: sllv $5, $5, $6 ; MMR6-NEXT: addiu $sp, $sp, 32 ; MMR6-NEXT: jrc $ra entry: diff --git a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll index 6349d5c64ab4..29cb34b8d970 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/srem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/srem.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ 
-13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @srem_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: srem_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $2, $zero, 0 +; ; GP32-LABEL: srem_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @srem_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $2, $zero, 0 ; +; MIPS3-LABEL: srem_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: addiu $2, $zero, 0 +; ; GP64-LABEL: srem_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,14 @@ entry: } define signext i8 @srem_i8(i8 signext %a, i8 signext %b) { +; MIPS2-LABEL: srem_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i8: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -83,6 +101,14 @@ define signext i8 @srem_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: 
srem_i8: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -114,6 +140,14 @@ entry: } define signext i16 @srem_i16(i16 signext %a, i16 signext %b) { +; MIPS2-LABEL: srem_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i16: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -127,6 +161,14 @@ define signext i16 @srem_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i16: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -158,6 +200,14 @@ entry: } define signext i32 @srem_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: srem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: srem_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: div $zero, $4, $5 @@ -171,6 +221,14 @@ define signext i32 @srem_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: srem_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: div $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: div $zero, $4, $5 @@ -202,6 +260,22 @@ entry: } define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: srem_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; 
MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__moddi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: srem_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -233,6 +307,14 @@ define signext i64 @srem_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: srem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: srem_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddiv $zero, $4, $5 @@ -284,6 +366,30 @@ entry: } define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: srem_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__modti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: srem_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -331,6 +437,25 @@ define signext i128 @srem_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: srem_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte 
Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(srem_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(srem_i128))) +; MIPS3-NEXT: ld $25, %call16(__modti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: srem_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll new file mode 100644 index 000000000000..db2c660e9bc7 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-mult.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -O3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +define signext i32 @mult_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: mult_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: mult $4, $5 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: mult $1, $6 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: mult_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: mul $1, $4, $5 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mul $2, $1, $6 +; 
+entry: + %mul = mul nsw i32 %a, %b + %mul1 = mul nsw i32 %mul, %c + ret i32 %mul1 +} + +define signext i64 @mul_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: mul_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: dmult $4, $5 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: dmult $1, $6 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: mul_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: dmult $4, $5 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: dmult $1, $6 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %mul = mul i64 %a, %b + %mul1 = mul i64 %mul, %c + ret i64 %mul1 +} diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll new file mode 100644 index 000000000000..4ec5ecc9e2f1 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-sdiv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: sdiv_i32: +; MIPS2: # %bb.0: # %entry +; 
MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: div $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: sdiv_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: div $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mflo $1 +; MIPS32-NEXT: div $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mflo $2 +; +entry: + %sdiv = sdiv i32 %a, %b + %sdiv1 = sdiv i32 %sdiv, %c + ret i32 %sdiv1 +} + +define signext i64 @sdiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: sdiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddiv $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: sdiv_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddiv $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: ddiv $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %sdiv = sdiv i64 %a, %b + %sdiv1 = sdiv i64 %sdiv, %c + ret i64 %sdiv1 +} + +define signext i32 @sdiv_lw_sdiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: sdiv_lw_sdiv_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; 
MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: sdiv_lw_sdiv_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %sdiv = sdiv i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %sdiv1 = sdiv i32 %sdiv, %2 + ret i32 %sdiv1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll new file mode 100644 index 000000000000..4f729b015b28 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-srem.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | 
FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: srem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: div $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: div $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: srem_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: div $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mfhi $1 +; MIPS32-NEXT: div $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mfhi $2 +; +entry: + %rem = srem i32 %a, %b + %rem1 = srem i32 %rem, %c + ret i32 %rem1 +} + +define signext i64 @srem_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: srem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddiv $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddiv $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: srem_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddiv $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mfhi $1 +; MIPS64-NEXT: ddiv $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mfhi $2 +; +entry: + %rem = srem i64 %a, %b + %rem1 = srem i64 %rem, %c + ret i64 %rem1 +} + +define signext i32 @srem_lw_srem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: srem_lw_srem_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 
+; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: div $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: srem_lw_srem_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: div $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %rem = srem i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %rem1 = srem i32 %rem, %2 + ret i32 %rem1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll new file mode 100644 index 000000000000..97ac0d8031cf --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-udiv.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ 
+; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: udiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: divu $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: udiv_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: divu $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mflo $1 +; MIPS32-NEXT: divu $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mflo $2 +; +entry: + %udiv = udiv i32 %a, %b + %udiv1 = udiv i32 %udiv, %c + ret i32 %udiv1 +} + +define signext i64 @udiv_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: udiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddivu $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: udiv_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddivu $zero, 
$4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mflo $1 +; MIPS64-NEXT: ddivu $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mflo $2 +; +entry: + %udiv = udiv i64 %a, %b + %udiv1 = udiv i64 %udiv, %c + ret i64 %udiv1 +} + +define signext i32 @udiv_lw_udiv_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: udiv_lw_udiv_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mflo $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: udiv_lw_udiv_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mflo $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %udiv = udiv i32 %0, %1 + %2 = load i32, ptr %c.addr, align 
4 + %udiv1 = udiv i32 %udiv, %2 + ret i32 %udiv1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll new file mode 100644 index 000000000000..e1819f1d57b7 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/two-consecutive-urem.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32 + +; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 +; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS64 + +; RUN: llc < %s -mtriple=mips -mcpu=mips2 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2-O0 +; RUN: llc < %s -mtriple=mips -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS32-O0 + +define signext i32 @urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-LABEL: urem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: nop +; MIPS2-NEXT: divu $zero, $1, $6 +; MIPS2-NEXT: teq $6, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; +; MIPS32-LABEL: urem_i32: +; MIPS32: # %bb.0: # %entry +; MIPS32-NEXT: divu $zero, $4, $5 +; MIPS32-NEXT: teq $5, $zero, 7 +; MIPS32-NEXT: mfhi $1 +; MIPS32-NEXT: divu $zero, $1, $6 +; MIPS32-NEXT: teq $6, $zero, 7 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mfhi $2 +; +entry: + %urem = urem i32 %a, %b + %urem1 = urem i32 %urem, %c + ret i32 %urem1 +} + +define 
signext i64 @urem_i64(i64 signext %a, i64 signext %b, i64 signext %c) { +; MIPS3-LABEL: urem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: nop +; MIPS3-NEXT: nop +; MIPS3-NEXT: ddivu $zero, $1, $6 +; MIPS3-NEXT: teq $6, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; +; MIPS64-LABEL: urem_i64: +; MIPS64: # %bb.0: # %entry +; MIPS64-NEXT: ddivu $zero, $4, $5 +; MIPS64-NEXT: teq $5, $zero, 7 +; MIPS64-NEXT: mfhi $1 +; MIPS64-NEXT: ddivu $zero, $1, $6 +; MIPS64-NEXT: teq $6, $zero, 7 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mfhi $2 +; +entry: + %urem = urem i64 %a, %b + %urem1 = urem i64 %urem, %c + ret i64 %urem1 +} + +define signext i32 @urem_lw_urem_i32(i32 signext %a, i32 signext %b, i32 signext %c) { +; MIPS2-O0-LABEL: urem_lw_urem_i32: +; MIPS2-O0: # %bb.0: # %entry +; MIPS2-O0-NEXT: addiu $sp, $sp, -16 +; MIPS2-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS2-O0-NEXT: sw $4, 12($sp) +; MIPS2-O0-NEXT: sw $5, 8($sp) +; MIPS2-O0-NEXT: sw $6, 4($sp) +; MIPS2-O0-NEXT: lw $2, 12($sp) +; MIPS2-O0-NEXT: lw $1, 8($sp) +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: lw $1, 4($sp) +; MIPS2-O0-NEXT: nop +; MIPS2-O0-NEXT: divu $zero, $2, $1 +; MIPS2-O0-NEXT: teq $1, $zero, 7 +; MIPS2-O0-NEXT: mfhi $2 +; MIPS2-O0-NEXT: addiu $sp, $sp, 16 +; MIPS2-O0-NEXT: jr $ra +; MIPS2-O0-NEXT: nop +; +; MIPS32-O0-LABEL: urem_lw_urem_i32: +; MIPS32-O0: # %bb.0: # %entry +; MIPS32-O0-NEXT: addiu $sp, $sp, -16 +; MIPS32-O0-NEXT: .cfi_def_cfa_offset 16 +; MIPS32-O0-NEXT: sw $4, 12($sp) +; MIPS32-O0-NEXT: sw $5, 8($sp) +; MIPS32-O0-NEXT: sw $6, 4($sp) +; MIPS32-O0-NEXT: lw $2, 12($sp) +; MIPS32-O0-NEXT: lw $1, 8($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: lw $1, 4($sp) +; MIPS32-O0-NEXT: divu $zero, $2, $1 +; MIPS32-O0-NEXT: teq $1, $zero, 
7 +; MIPS32-O0-NEXT: mfhi $2 +; MIPS32-O0-NEXT: addiu $sp, $sp, 16 +; MIPS32-O0-NEXT: jr $ra +; MIPS32-O0-NEXT: nop +; +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %c.addr = alloca i32, align 4 + store i32 %a, ptr %a.addr, align 4 + store i32 %b, ptr %b.addr, align 4 + store i32 %c, ptr %c.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %b.addr, align 4 + %rem = urem i32 %0, %1 + %2 = load i32, ptr %c.addr, align 4 + %urem1 = urem i32 %rem, %2 + ret i32 %urem1 +} + diff --git a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll index e3dd347e723b..cc2c6614e69c 100644 --- a/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/udiv.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 
define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) { +; MIPS2-LABEL: udiv_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: move $2, $4 +; ; GP32-LABEL: udiv_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: move $2, $4 ; +; MIPS3-LABEL: udiv_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: move $2, $4 +; ; GP64-LABEL: udiv_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,14 @@ entry: } define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) { +; MIPS2-LABEL: udiv_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i8: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -83,6 +101,14 @@ define zeroext i8 @udiv_i8(i8 zeroext %a, i8 zeroext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i8: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -114,6 +140,14 @@ entry: } define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) { +; MIPS2-LABEL: udiv_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i16: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -127,6 +161,14 @@ define zeroext i16 @udiv_i16(i16 zeroext %a, i16 zeroext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: 
udiv_i16: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -158,6 +200,14 @@ entry: } define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) { +; MIPS2-LABEL: udiv_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mflo $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: udiv_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -171,6 +221,14 @@ define signext i32 @udiv_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: udiv_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -202,6 +260,22 @@ entry: } define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: udiv_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__udivdi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: udiv_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -233,6 +307,14 @@ define signext i64 @udiv_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: udiv_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mflo $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: udiv_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddivu $zero, $4, $5 @@ -284,6 +366,30 @@ entry: } define 
signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: udiv_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__udivti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: udiv_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -331,6 +437,25 @@ define signext i128 @udiv_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: udiv_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(udiv_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(udiv_i128))) +; MIPS3-NEXT: ld $25, %call16(__udivti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: udiv_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll index 4105d67da6f1..5da1f614b8f1 100644 --- 
a/llvm/test/CodeGen/Mips/llvm-ir/urem.ll +++ b/llvm/test/CodeGen/Mips/llvm-ir/urem.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=mips -mcpu=mips2 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP32,GP32R0R2 ; RUN: llc < %s -mtriple=mips -mcpu=mips32r2 -relocation-model=pic \ @@ -13,9 +13,9 @@ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefix=GP32R6 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips3 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips4 -relocation-model=pic \ -; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 +; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=MIPS3 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64 -relocation-model=pic \ ; RUN: -mips-jalr-reloc=false | FileCheck %s -check-prefixes=GP64,GP64R0R1 ; RUN: llc < %s -mtriple=mips64 -mcpu=mips64r2 -relocation-model=pic \ @@ -35,6 +35,11 @@ ; RUN: FileCheck %s -check-prefix=MMR6 define signext i1 @urem_i1(i1 signext %a, i1 signext %b) { +; MIPS2-LABEL: urem_i1: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $2, $zero, 0 +; ; GP32-LABEL: urem_i1: ; GP32: # %bb.0: # %entry ; GP32-NEXT: jr $ra @@ -45,6 +50,11 @@ define signext i1 @urem_i1(i1 signext %a, i1 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $2, $zero, 0 ; +; MIPS3-LABEL: urem_i1: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: addiu $2, $zero, 0 +; ; GP64-LABEL: urem_i1: ; GP64: # %bb.0: # %entry ; GP64-NEXT: jr $ra @@ -70,6 +80,17 @@ entry: } define signext i8 @urem_i8(i8 
signext %a, i8 signext %b) { +; MIPS2-LABEL: urem_i8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: andi $1, $5, 255 +; MIPS2-NEXT: andi $2, $4, 255 +; MIPS2-NEXT: divu $zero, $2, $1 +; MIPS2-NEXT: teq $1, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: sll $1, $1, 24 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 24 +; ; GP32R0R2-LABEL: urem_i8: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: andi $1, $5, 255 @@ -100,6 +121,17 @@ define signext i8 @urem_i8(i8 signext %a, i8 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seb $2, $2 ; +; MIPS3-LABEL: urem_i8: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: andi $1, $5, 255 +; MIPS3-NEXT: andi $2, $4, 255 +; MIPS3-NEXT: divu $zero, $2, $1 +; MIPS3-NEXT: teq $1, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: sll $1, $1, 24 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 24 +; ; GP64R0R1-LABEL: urem_i8: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: andi $1, $5, 255 @@ -154,6 +186,17 @@ entry: } define signext i16 @urem_i16(i16 signext %a, i16 signext %b) { +; MIPS2-LABEL: urem_i16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: andi $1, $5, 65535 +; MIPS2-NEXT: andi $2, $4, 65535 +; MIPS2-NEXT: divu $zero, $2, $1 +; MIPS2-NEXT: teq $1, $zero, 7 +; MIPS2-NEXT: mfhi $1 +; MIPS2-NEXT: sll $1, $1, 16 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: sra $2, $1, 16 +; ; GP32R0R2-LABEL: urem_i16: ; GP32R0R2: # %bb.0: # %entry ; GP32R0R2-NEXT: andi $1, $5, 65535 @@ -184,6 +227,17 @@ define signext i16 @urem_i16(i16 signext %a, i16 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: seh $2, $2 ; +; MIPS3-LABEL: urem_i16: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: andi $1, $5, 65535 +; MIPS3-NEXT: andi $2, $4, 65535 +; MIPS3-NEXT: divu $zero, $2, $1 +; MIPS3-NEXT: teq $1, $zero, 7 +; MIPS3-NEXT: mfhi $1 +; MIPS3-NEXT: sll $1, $1, 16 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: sra $2, $1, 16 +; ; GP64R0R1-LABEL: urem_i16: ; GP64R0R1: # %bb.0: # %entry ; GP64R0R1-NEXT: andi $1, $5, 65535 @@ -238,6 +292,14 @@ entry: } define signext i32 @urem_i32(i32 
signext %a, i32 signext %b) { +; MIPS2-LABEL: urem_i32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: divu $zero, $4, $5 +; MIPS2-NEXT: teq $5, $zero, 7 +; MIPS2-NEXT: mfhi $2 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; GP32-LABEL: urem_i32: ; GP32: # %bb.0: # %entry ; GP32-NEXT: divu $zero, $4, $5 @@ -251,6 +313,14 @@ define signext i32 @urem_i32(i32 signext %a, i32 signext %b) { ; GP32R6-NEXT: teq $5, $zero, 7 ; GP32R6-NEXT: jrc $ra ; +; MIPS3-LABEL: urem_i32: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: divu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: urem_i32: ; GP64: # %bb.0: # %entry ; GP64-NEXT: divu $zero, $4, $5 @@ -282,6 +352,22 @@ entry: } define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { +; MIPS2-LABEL: urem_i64: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -24 +; MIPS2-NEXT: .cfi_def_cfa_offset 24 +; MIPS2-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $25, %call16(__umoddi3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 24 +; ; GP32-LABEL: urem_i64: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -313,6 +399,14 @@ define signext i64 @urem_i64(i64 signext %a, i64 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 24 ; +; MIPS3-LABEL: urem_i64: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: ddivu $zero, $4, $5 +; MIPS3-NEXT: teq $5, $zero, 7 +; MIPS3-NEXT: mfhi $2 +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: nop +; ; GP64-LABEL: urem_i64: ; GP64: # %bb.0: # %entry ; GP64-NEXT: ddivu $zero, $4, $5 @@ -364,6 +458,30 @@ entry: } define signext i128 @urem_i128(i128 signext %a, i128 signext %b) { +; MIPS2-LABEL: urem_i128: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: 
lui $2, %hi(_gp_disp) +; MIPS2-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS2-NEXT: addiu $sp, $sp, -40 +; MIPS2-NEXT: .cfi_def_cfa_offset 40 +; MIPS2-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS2-NEXT: .cfi_offset 31, -4 +; MIPS2-NEXT: addu $gp, $2, $25 +; MIPS2-NEXT: lw $1, 60($sp) +; MIPS2-NEXT: lw $2, 64($sp) +; MIPS2-NEXT: lw $3, 68($sp) +; MIPS2-NEXT: sw $3, 28($sp) +; MIPS2-NEXT: sw $2, 24($sp) +; MIPS2-NEXT: sw $1, 20($sp) +; MIPS2-NEXT: lw $1, 56($sp) +; MIPS2-NEXT: sw $1, 16($sp) +; MIPS2-NEXT: lw $25, %call16(__umodti3)($gp) +; MIPS2-NEXT: jalr $25 +; MIPS2-NEXT: nop +; MIPS2-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: addiu $sp, $sp, 40 +; ; GP32-LABEL: urem_i128: ; GP32: # %bb.0: # %entry ; GP32-NEXT: lui $2, %hi(_gp_disp) @@ -411,6 +529,25 @@ define signext i128 @urem_i128(i128 signext %a, i128 signext %b) { ; GP32R6-NEXT: jr $ra ; GP32R6-NEXT: addiu $sp, $sp, 40 ; +; MIPS3-LABEL: urem_i128: +; MIPS3: # %bb.0: # %entry +; MIPS3-NEXT: daddiu $sp, $sp, -16 +; MIPS3-NEXT: .cfi_def_cfa_offset 16 +; MIPS3-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS3-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS3-NEXT: .cfi_offset 31, -8 +; MIPS3-NEXT: .cfi_offset 28, -16 +; MIPS3-NEXT: lui $1, %hi(%neg(%gp_rel(urem_i128))) +; MIPS3-NEXT: daddu $1, $1, $25 +; MIPS3-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(urem_i128))) +; MIPS3-NEXT: ld $25, %call16(__umodti3)($gp) +; MIPS3-NEXT: jalr $25 +; MIPS3-NEXT: nop +; MIPS3-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS3-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS3-NEXT: jr $ra +; MIPS3-NEXT: daddiu $sp, $sp, 16 +; ; GP64-LABEL: urem_i128: ; GP64: # %bb.0: # %entry ; GP64-NEXT: daddiu $sp, $sp, -16 diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll new file mode 100644 index 000000000000..82eb5fb71677 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 
-mattr=+ptx78 | FileCheck %s +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: fence_sc_cluster +define void @fence_sc_cluster() local_unnamed_addr { + ; CHECK: fence.sc.cluster + fence syncscope("cluster") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_cluster +define void @fence_acq_rel_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_cluster +define void @fence_release_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") release + ret void +} + +; CHECK-LABEL: fence_acquire_cluster +define void @fence_acquire_cluster() local_unnamed_addr { + ; CHECK: fence.acq_rel.cluster + fence syncscope("cluster") acquire + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll index d3aace95e966..626685f82f32 100644 --- a/llvm/test/CodeGen/NVPTX/fence.ll +++ b/llvm/test/CodeGen/NVPTX/fence.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70 ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %} +; TODO: implement and test thread scope. 
+ ; CHECK-LABEL: fence_sc_sys define void @fence_sc_sys() local_unnamed_addr { ; SM60: membar.sys @@ -16,21 +18,85 @@ define void @fence_acq_rel_sys() local_unnamed_addr { ; SM60: membar.sys ; SM70: fence.acq_rel.sys fence acq_rel - ret void + ret void } ; CHECK-LABEL: fence_release_sys define void @fence_release_sys() local_unnamed_addr { ; SM60: membar.sys - ; SM70: fence.acq_rel.sys + ; SM70: fence.acq_rel.sys fence release - ret void + ret void } ; CHECK-LABEL: fence_acquire_sys define void @fence_acquire_sys() local_unnamed_addr { ; SM60: membar.sys - ; SM70: fence.acq_rel.sys + ; SM70: fence.acq_rel.sys fence acquire - ret void + ret void +} + +; CHECK-LABEL: fence_sc_gpu +define void @fence_sc_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.sc.gpu + fence syncscope("device") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_gpu +define void @fence_acq_rel_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_gpu +define void @fence_release_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") release + ret void +} + +; CHECK-LABEL: fence_acquire_gpu +define void @fence_acquire_gpu() local_unnamed_addr { + ; SM60: membar.gl + ; SM70: fence.acq_rel.gpu + fence syncscope("device") acquire + ret void +} + +; CHECK-LABEL: fence_sc_cta +define void @fence_sc_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.sc.cta + fence syncscope("block") seq_cst + ret void +} + +; CHECK-LABEL: fence_acq_rel_cta +define void @fence_acq_rel_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") acq_rel + ret void +} + +; CHECK-LABEL: fence_release_cta +define void @fence_release_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") release + ret void +} + +; CHECK-LABEL: fence_acquire_cta +define void 
@fence_acquire_cta() local_unnamed_addr { + ; SM60: membar.cta + ; SM70: fence.acq_rel.cta + fence syncscope("block") acquire + ret void }
\ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 9cea33d12027..4b200eacb0cf 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -1,10 +1,367 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s ; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %} +; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" +; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile release": generates "st.release.sys;" +; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;" + +; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;" +; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;" +; but should generate "fence.sc.sys; st.mmio.relaxed.sys;" + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. + +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: implement and test thread scope. + +; TODO: add weak,atomic,volatile,atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . 
+ ;; generic statespace -; CHECK-LABEL: generic_acq_rel -define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_gpu +define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1 + + ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2 + + ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8 + + ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_gpu +define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_cta +define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1 + + ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2 + + ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8 + + ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_cta +define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4 
+ %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_gpu +define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4 + %e.add = 
fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_gpu +define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e 
syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_cta +define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e 
syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_cta +define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = 
load atomic volatile float, ptr %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_sys +define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -31,7 +388,7 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e release, align 4 @@ -44,8 +401,8 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam ret void } -; CHECK-LABEL: generic_acq_rel_volatile -define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_acq_rel_volatile_sys +define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -72,7 +429,7 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e release, align 4 @@ -85,8 +442,172 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo ret void } -; CHECK-LABEL: generic_sc -define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_acq_rel_gpu +define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("device") release, 
align 2 + + ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_gpu +define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_cta +define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_cta +define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4 + 
%e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_sys +define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a seq_cst, align 1 @@ -122,7 +643,7 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e seq_cst, align 4 @@ -138,8 +659,8 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad ret void } -; CHECK-LABEL: generic_sc_volatile -define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_sc_volatile_sys +define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 @@ -175,7 +696,7 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e seq_cst, align 4 @@ -191,10 +712,550 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ret void } +; CHECK-LABEL: generic_sc_gpu +define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b 
syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_volatile_gpu +define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_cta +define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 
%d.add, ptr %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_volatile_cta +define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 + + ; 
CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8 + + ret void +} + ;; global statespace -; CHECK-LABEL: global_acq_rel -define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_unordered_gpu +define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") 
unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_volatile_gpu +define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e 
syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_cta +define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, 
align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_volatile_cta +define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c 
syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_gpu +define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile_gpu +define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") 
monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_cta +define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") 
monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile_cta +define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e 
syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_sys +define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -221,7 +1282,7 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e release, align 4 @@ -234,8 +1295,8 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ret void } -; CHECK-LABEL: global_acq_rel_volatile -define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_acq_rel_volatile_sys +define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -262,7 +1323,7 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 @@ -275,8 +1336,172 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ret void } -; CHECK-LABEL: global_seq_cst -define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_acq_rel_gpu +define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 + + ; CHECK: 
ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_gpu +define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 + + ; CHECK: 
ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_cta +define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.global.u32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_cta +define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_sys +define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 @@ -312,7 +1537,7 @@ define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 @@ -328,8 +1553,8 @@ define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp ret void } -; CHECK-LABEL: global_seq_cst_volatile -define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_seq_cst_volatile_sys +define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 @@ -365,7 +1590,7 @@ define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 @@ -381,10 +1606,550 @@ define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p ret void } +; CHECK-LABEL: global_seq_cst_gpu +define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, 
align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_gpu +define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") 
seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_cta +define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.f32 
%f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_cta +define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; 
CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 + + ret void +} + ;; shared statespace -; CHECK-LABEL: shared_acq_rel -define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_unordered_gpu +define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 + + ; CHECK: 
ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_gpu +define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 + + ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_cta +define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.relaxed.cta.shared.u32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_cta +define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_gpu +define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], 
%rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_gpu +define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + 
store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_cta +define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_cta +define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: 
st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_sys +define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -411,7 +2176,7 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e release, align 4 @@ -424,8 +2189,8 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ret void } -; CHECK-LABEL: shared_acq_rel_volatile -define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_acq_rel_volatile_sys +define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -452,7 +2217,7 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 @@ -465,8 +2230,172 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ret void } -; CHECK-LABEL: shared_seq_cst -define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_acq_rel_gpu +define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 + + ; CHECK: 
ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_gpu +define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 + + ; CHECK: 
ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_cta +define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.cta.shared.u32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_cta +define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_sys +define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 @@ -502,7 +2431,7 @@ define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 @@ -510,16 +2439,16 @@ define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 - %f.add = fadd double %f.load, 1. - ; CHECK: fence.sc.sys + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void } -; CHECK-LABEL: shared_seq_cst_volatile -define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_seq_cst_volatile_sys +define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 @@ -555,7 +2484,7 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ; CHECK: fence.sc.sys ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 @@ -571,13 +2500,550 @@ define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p ret void } +; CHECK-LABEL: shared_seq_cst_gpu +define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, 
align 8 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.gpu + ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.gpu + ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_gpu +define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") 
seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_cta +define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.f32 
%f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.cta + ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.cta + ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_cta +define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; 
CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 + + ret void +} + ;; local statespace -; CHECK-LABEL: local_acq_rel -define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
+; CHECK-LABEL: local_unordered_gpu +define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_gpu +define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_cta +define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, 
[%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_cta +define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 + %d.add = add i64 
%d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_gpu +define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + + ; 
CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_gpu +define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, 
align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_cta +define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_cta +define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") 
monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_sys +define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -604,7 +3070,7 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e release, align 4 @@ -617,11 +3083,8 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ret void } -; CHECK-LABEL: local_acq_rel_volatile -define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
- +; CHECK-LABEL: local_acq_rel_volatile_sys +define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 @@ -648,7 +3111,7 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 @@ -661,11 +3124,172 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ret void } -; CHECK-LABEL: local_seq_cst -define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
+; CHECK-LABEL: local_acq_rel_gpu +define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_gpu +define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_cta +define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = 
load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_cta +define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], 
%rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_sys +define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 @@ -692,7 +3316,7 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4 @@ -705,11 +3329,8 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa ret void } -; CHECK-LABEL: local_seq_cst_volatile -define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
- +; CHECK-LABEL: local_seq_cst_volatile_sys +define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 @@ -736,7 +3357,7 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4 - %e.add = fadd float %e.load, 1.0 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4 @@ -746,10 +3367,169 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8 - ; TODO: LLVM IR Verifier does not support atomics on vector types. 
+ ret void +} + +; CHECK-LABEL: local_seq_cst_gpu +define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 ret void } -; TODO: add plain,atomic,volatile,atomic volatile tests -; for .const and .param statespaces
\ No newline at end of file +; CHECK-LABEL: local_seq_cst_volatile_gpu +define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_cta +define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = 
load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_volatile_cta +define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], 
%rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll new file mode 100644 index 000000000000..645170da51a0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll @@ -0,0 +1,1423 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s +; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} + +; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;" +; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile release": generates "st.release.sys;" +; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;" + +; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;" +; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;" +; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;" +; but should generate "fence.sc.sys; st.mmio.relaxed.sys;" + +; TODO: add i1, <8 x i8>, and <6 x i8> vector tests. 
+ +; TODO: add test for vectors that exceed 128-bit length +; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors +; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. + +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: implement and test thread scope. + +; TODO: add weak, atomic, volatile, and atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .shared.sys into .shared.cta or .shared.cluster. + +;; generic statespace + +; Cluster-scope unordered atomic loads/stores through generic pointers: +; the checks below expect ld.relaxed.cluster / st.relaxed.cluster for each type. +; NOTE(review): the f64 load/store in this function reuse %e (there is no sixth +; pointer parameter) - confirm this reuse is intentional. +; CHECK-LABEL: generic_unordered_cluster +define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic 
i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_unordered_volatile_cluster +define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; 
CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_cluster +define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") 
monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_monotonic_volatile_cluster +define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_cluster +define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4 + + ; 
CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_acq_rel_volatile_cluster +define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile 
i32 %c.add, ptr %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_cluster +define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load 
= load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: generic_sc_volatile_cluster +define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; global statespace + +; CHECK-LABEL: global_unordered_cluster +define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 
8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_unordered_volatile_cluster +define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c 
syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_cluster +define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_monotonic_volatile_cluster +define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + %d.add = 
add i64 %d.load, 1 + ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_cluster +define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 
%c.load, 1 + ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: global_acq_rel_volatile_cluster +define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 + %e.add = 
fadd float %e.load, 1. + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_cluster +define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: 
fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: global_seq_cst_volatile_cluster +define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(1) 
%d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; shared + +; CHECK-LABEL: shared_unordered_cluster +define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; 
CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_unordered_volatile_cluster +define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + %e.add = 
fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_cluster +define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + %d.add = 
add i64 %d.load, 1 + ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_monotonic_volatile_cluster +define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") 
monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_cluster +define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_acq_rel_volatile_cluster +define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 + %d.add = 
add i64 %d.load, 1 + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_cluster +define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.cluster + ; CHECK: 
ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.cluster + ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: fence.sc.cluster + ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: shared_seq_cst_volatile_cluster +define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(3) 
%d syncscope("cluster") seq_cst, align 8 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: fence.sc.sys + ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: fence.sc.sys + ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +;; local statespace + +; CHECK-LABEL: local_unordered_cluster +define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, 
ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_unordered_volatile_cluster +define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + 
%c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_cluster +define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_monotonic_volatile_cluster +define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr 
addrspace(5) %d syncscope("cluster") monotonic, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_cluster +define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) 
%d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: local_acq_rel_volatile_cluster +define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic 
volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_cluster +define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = 
load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. + ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. 
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + + ret void +} + +; CHECK-LABEL: local_seq_cst_volatile_cluster +define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { + ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + %a.add = add i8 %a.load, 1 + ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 + + ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + %b.add = add i16 %b.load, 1 + ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 + + ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + %c.add = add i32 %c.load, 1 + ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + %d.add = add i64 %d.load, 1 + ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 + + ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + %e.add = fadd float %e.load, 1. 
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 + + ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + %f.add = fadd double %f.load, 1. + ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index aac73f71a676..f922fd92fa24 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -9,10 +9,21 @@ ; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. +; TODO: generate PTX that preserves Concurrent Forward Progress +; for atomic operations to local statespace +; by generating atomic or volatile operations. + +; TODO: design exposure for atomic operations on vector types. + +; TODO: add weak,atomic,volatile,atomic volatile tests +; for .const and .param statespaces. + +; TODO: optimize .sys.shared into .cta.shared or .cluster.shared . 
+ ; generic statespace -; CHECK-LABEL: generic_plain -define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { +; CHECK-LABEL: generic_weak +define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr { ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 @@ -238,198 +249,198 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr ret void } -; CHECK-LABEL: generic_monotonic -define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_sys +define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a monotonic, align 1 + %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a monotonic, align 1 + store atomic i8 %a.add, ptr %a unordered, align 1 ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b monotonic, align 2 + %b.load = load atomic i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b monotonic, align 2 + store atomic i16 %b.add, ptr %b unordered, align 2 ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c monotonic, align 4 + %c.load = load atomic i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], 
%r{{[0-9]+}} - store atomic i32 %c.add, ptr %c monotonic, align 4 + store atomic i32 %c.add, ptr %c unordered, align 4 ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d monotonic, align 8 + %d.load = load atomic i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d monotonic, align 8 + store atomic i64 %d.add, ptr %d unordered, align 8 ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e monotonic, align 4 + %e.load = load atomic float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e monotonic, align 4 + store atomic float %e.add, ptr %e unordered, align 4 ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e monotonic, align 8 + %f.load = load atomic double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e monotonic, align 8 + store atomic double %f.add, ptr %e unordered, align 8 ret void } -; CHECK-LABEL: generic_monotonic_volatile -define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_unordered_volatile_sys +define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr %a unordered, align 1 ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr %b unordered, align 2 ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr %c unordered, align 4 ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 
%d.add, ptr %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr %d unordered, align 8 ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.load = load atomic volatile float, ptr %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e monotonic, align 4 + store atomic volatile float %e.add, ptr %e unordered, align 4 ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e monotonic, align 8 + %f.load = load atomic volatile double, ptr %e unordered, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e monotonic, align 8 + store atomic volatile double %f.add, ptr %e unordered, align 8 ret void } -; CHECK-LABEL: generic_unordered -define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic_sys +define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr %a unordered, align 1 + %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr %a unordered, align 1 + store atomic i8 %a.add, ptr %a monotonic, align 1 ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr %b unordered, align 2 + %b.load = load atomic i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], 
%rs{{[0-9]+}} - store atomic i16 %b.add, ptr %b unordered, align 2 + store atomic i16 %b.add, ptr %b monotonic, align 2 ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr %c unordered, align 4 + %c.load = load atomic i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr %c unordered, align 4 + store atomic i32 %c.add, ptr %c monotonic, align 4 ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr %d unordered, align 8 + %d.load = load atomic i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr %d unordered, align 8 + store atomic i64 %d.add, ptr %d monotonic, align 8 ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr %e unordered, align 4 + store atomic float %e.add, ptr %e monotonic, align 4 ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr %e unordered, align 8 + %f.load = load atomic double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr %e unordered, align 8 + store atomic double %f.add, ptr %e monotonic, align 8 ret void } -; CHECK-LABEL: generic_unordered_volatile -define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { +; CHECK-LABEL: generic_monotonic_volatile_sys +define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr %a unordered, align 1 + %a.load = load atomic volatile i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr %a unordered, align 1 + store atomic volatile i8 %a.add, ptr %a monotonic, align 1 ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr %b unordered, align 2 + %b.load = load atomic volatile i16, ptr %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr %b unordered, align 2 + store atomic volatile i16 %b.add, ptr %b monotonic, align 2 ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr %c unordered, align 4 + %c.load = load atomic volatile i32, ptr %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr %c unordered, align 4 + store atomic volatile i32 %c.add, ptr %c monotonic, align 4 ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr %d unordered, align 8 + %d.load = load atomic volatile i64, ptr %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 
%d.add, ptr %d unordered, align 8 + store atomic volatile i64 %d.add, ptr %d monotonic, align 8 ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr %e unordered, align 4 + store atomic volatile float %e.add, ptr %e monotonic, align 4 ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr %e unordered, align 8 + %f.load = load atomic volatile double, ptr %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr %e unordered, align 8 + store atomic volatile double %f.add, ptr %e monotonic, align 8 ret void } ;; global statespace -; CHECK-LABEL: global_plain -define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { +; CHECK-LABEL: global_weak +define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr { ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 @@ -630,222 +641,222 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs ret void } -; CHECK-LABEL: global_monotonic -define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_unordered_sys +define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: 
ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store 
atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 ret void } -; CHECK-LABEL: global_monotonic_volatile -define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_unordered_volatile_sys +define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; 
SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 + store atomic 
volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 ret void } -; CHECK-LABEL: global_unordered -define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_monotonic_sys +define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - 
%e.load = load atomic float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(1) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(1) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void } -; CHECK-LABEL: global_unordered_volatile -define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { +; CHECK-LABEL: global_monotonic_volatile_sys +define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1 + store atomic 
volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1 ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2 ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4 + store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4 ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8 ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic 
volatile float, ptr addrspace(1) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4 ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8 ret void } ;; shared statespace -; CHECK-LABEL: shared_plain -define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { +; CHECK-LABEL: shared_weak +define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr { ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 @@ -1046,202 +1057,198 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs ret void } -; CHECK-LABEL: shared_monotonic -define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. 
- +; CHECK-LABEL: shared_unordered_sys +define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr 
addrspace(3) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 ret void } -; CHECK-LABEL: shared_monotonic_volatile -define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_unordered_volatile_sys +define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 + store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 + store 
atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 ret void } -; CHECK-LABEL: shared_unordered -define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared. 
- +; CHECK-LABEL: shared_monotonic_sys +define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1 ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2 ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4 ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr 
addrspace(3) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8 ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(3) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4 ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(3) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void } -; CHECK-LABEL: shared_unordered_volatile -define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { +; CHECK-LABEL: shared_monotonic_volatile_sys +define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1 + store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1 ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2 ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4 + store 
atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4 ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8 ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4 ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8 ret void } ;; local statespace -; CHECK-LABEL: local_plain -define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { +; CHECK-LABEL: local_weak +define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -1343,9 +1350,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace ; CHECK-LABEL: local_volatile define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using volatile operations. - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -1445,175 +1449,166 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp ret void } -; CHECK-LABEL: local_monotonic -define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by using PTX atomic operations. 
- +; CHECK-LABEL: local_unordered_sys +define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 %e.add = 
fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 ret void } -; CHECK-LABEL: local_monotonic_volatile -define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; TODO: generate PTX that preserves Concurrent Forward Progress - ; by generating atomic or volatile operations - +; CHECK-LABEL: local_unordered_volatile_sys +define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 + store atomic volatile i16 
%b.add, ptr addrspace(5) %b unordered, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 %e.add = fadd float %e.load, 1.0 ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 ret void } -; CHECK-LABEL: local_unordered -define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_monotonic_sys +define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 + %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1 + store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2 + %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2 + store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4 + %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4 + store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8 + %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8 %d.add = 
add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8 + store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic float %e.add, ptr addrspace(5) %e unordered, align 4 + store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8 + %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8 %f.add = fadd double %f.load, 1. ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic double %f.add, ptr addrspace(5) %e unordered, align 8 + store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8 ret void } -; CHECK-LABEL: local_unordered_volatile -define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { +; CHECK-LABEL: local_monotonic_volatile +define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 + %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1 + store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1 ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] - %b.load = load 
atomic volatile i16, ptr addrspace(5) %b unordered, align 2 + %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2 %b.add = add i16 %b.load, 1 ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} - store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2 + store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2 ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] - %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4 + %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4 %c.add = add i32 %c.load, 1 ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} - store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4 + store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4 ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] - %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8 + %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8 %d.add = add i64 %d.load, 1 ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} - store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8 + store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8 ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] - %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4 - %e.add = fadd float %e.load, 1.0 + %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4 + %e.add = fadd float %e.load, 1. ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} - store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4 + store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4 ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] - %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8 + %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} - store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8 + store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8 ret void } - -; TODO: add plain,atomic,volatile,atomic volatile tests -; for .const and .param statespaces
\ No newline at end of file diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll index 20c7ae5908d2..9ec5bcd13403 100644 --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -9,26 +9,29 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) +declare i64 @llvm.fshl.i64(i64, i64, i64) +declare i64 @llvm.fshr.i64(i64, i64, i64) +declare i32 @llvm.fshl.i32(i32, i32, i32) +declare i32 @llvm.fshr.i32(i32, i32, i32) + + ; SM20: rotate32 ; SM35: rotate32 define i32 @rotate32(i32 %a, i32 %b) { ; SM20-LABEL: rotate32( ; SM20: { -; SM20-NEXT: .reg .b32 %r<4>; +; SM20-NEXT: .reg .b32 %r<9>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0]; ; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b32 %lhs; -; SM20-NEXT: .reg .b32 %rhs; -; SM20-NEXT: .reg .b32 %amt2; -; SM20-NEXT: shl.b32 %lhs, %r1, %r2; -; SM20-NEXT: sub.s32 %amt2, 32, %r2; -; SM20-NEXT: shr.b32 %rhs, %r1, %amt2; -; SM20-NEXT: add.u32 %r3, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b32 [func_retval0+0], %r3; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: shl.b32 %r4, %r1, %r3; +; SM20-NEXT: neg.s32 %r5, %r2; +; SM20-NEXT: and.b32 %r6, %r5, 31; +; SM20-NEXT: shr.u32 %r7, %r1, %r6; +; SM20-NEXT: or.b32 %r8, %r4, %r7; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r8; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate32( @@ -50,45 +53,36 @@ define i32 @rotate32(i32 %a, i32 %b) { define i64 @rotate64(i64 %a, i32 %b) { ; SM20-LABEL: rotate64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg 
.u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotate64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<6>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1]; -; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; -; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3; -; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b) ret i64 %val @@ -99,45 +93,36 @@ define i64 @rotate64(i64 %a, i32 %b) { define i64 @rotateright64(i64 %a, i32 %b) { ; SM20-LABEL: rotateright64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; 
SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotateright64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<6>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: { -; SM35-NEXT: .reg .b32 %dummy; -; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1; -; SM35-NEXT: } -; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1]; -; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; -; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3; -; SM35-NEXT: mov.b64 %rd2, {%r5, %r4}; -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b) ret i64 %val @@ -148,18 +133,14 @@ define i64 @rotateright64(i64 %a, i32 %b) { define i32 
@rotl0(i32 %x) { ; SM20-LABEL: rotl0( ; SM20: { -; SM20-NEXT: .reg .b32 %r<3>; +; SM20-NEXT: .reg .b32 %r<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b32 %lhs; -; SM20-NEXT: .reg .b32 %rhs; -; SM20-NEXT: shl.b32 %lhs, %r1, 8; -; SM20-NEXT: shr.b32 %rhs, %r1, 24; -; SM20-NEXT: add.u32 %r2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b32 [func_retval0+0], %r2; +; SM20-NEXT: shr.u32 %r2, %r1, 24; +; SM20-NEXT: shl.b32 %r3, %r1, 8; +; SM20-NEXT: or.b32 %r4, %r3, %r2; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl0( @@ -177,51 +158,40 @@ define i32 @rotl0(i32 %x) { ret i32 %t2 } -declare i64 @llvm.fshl.i64(i64, i64, i64) -declare i64 @llvm.fshr.i64(i64, i64, i64) - ; SM35: rotl64 define i64 @rotl64(i64 %a, i64 %n) { ; SM20-LABEL: rotl64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<2>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // 
%bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: .reg .u32 %amt2; -; SM35-NEXT: and.b32 %amt2, %r1, 63; -; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2; -; SM35-NEXT: sub.u32 %amt2, 64, %amt2; -; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -231,34 +201,26 @@ define i64 @rotl64(i64 %a, i64 %n) { define i64 @rotl64_imm(i64 %a) { ; SM20-LABEL: rotl64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: shl.b64 %lhs, %rd1, 2; -; SM20-NEXT: shr.b64 %rhs, %rd1, 62; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: shr.u64 %rd2, %rd1, 62; +; SM20-NEXT: shl.b64 %rd3, %rd1, 2; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotl64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: shl.b64 %lhs, %rd1, 2; -; SM35-NEXT: shr.b64 %rhs, %rd1, 62; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 
[func_retval0+0], %rd2; +; SM35-NEXT: shr.u64 %rd2, %rd1, 62; +; SM35-NEXT: shl.b64 %rd3, %rd1, 2; +; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66) ret i64 %val @@ -268,44 +230,36 @@ define i64 @rotl64_imm(i64 %a) { define i64 @rotr64(i64 %a, i64 %n) { ; SM20-LABEL: rotr64( ; SM20: { -; SM20-NEXT: .reg .b32 %r<2>; -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: .reg .u32 %amt2; -; SM20-NEXT: and.b32 %amt2, %r1, 63; -; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM20-NEXT: sub.u32 %amt2, 64, %amt2; -; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM20-NEXT: neg.s32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64( ; SM35: { -; SM35-NEXT: .reg .b32 %r<2>; -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; ; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: .reg .u32 %amt2; -; SM35-NEXT: and.b32 %amt2, %r1, 63; -; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2; -; SM35-NEXT: sub.u32 %amt2, 64, %amt2; -; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; 
+; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; +; SM35-NEXT: neg.s32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd3, %rd1, %r4; +; SM35-NEXT: or.b64 %rd4, %rd2, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n) ret i64 %val @@ -315,35 +269,180 @@ define i64 @rotr64(i64 %a, i64 %n) { define i64 @rotr64_imm(i64 %a) { ; SM20-LABEL: rotr64_imm( ; SM20: { -; SM20-NEXT: .reg .b64 %rd<3>; +; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM20-NEXT: { -; SM20-NEXT: .reg .b64 %lhs; -; SM20-NEXT: .reg .b64 %rhs; -; SM20-NEXT: shl.b64 %lhs, %rd1, 62; -; SM20-NEXT: shr.b64 %rhs, %rd1, 2; -; SM20-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM20-NEXT: } -; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM20-NEXT: shl.b64 %rd2, %rd1, 62; +; SM20-NEXT: shr.u64 %rd3, %rd1, 2; +; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM20-NEXT: ret; ; ; SM35-LABEL: rotr64_imm( ; SM35: { -; SM35-NEXT: .reg .b64 %rd<3>; +; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0]; -; SM35-NEXT: { -; SM35-NEXT: .reg .b64 %lhs; -; SM35-NEXT: .reg .b64 %rhs; -; SM35-NEXT: shl.b64 %lhs, %rd1, 62; -; SM35-NEXT: shr.b64 %rhs, %rd1, 2; -; SM35-NEXT: add.u64 %rd2, %lhs, %rhs; -; SM35-NEXT: } -; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2; +; SM35-NEXT: shl.b64 %rd2, %rd1, 62; +; SM35-NEXT: shr.u64 %rd3, %rd1, 2; +; SM35-NEXT: or.b64 %rd4, %rd3, %rd2; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4; ; SM35-NEXT: ret; %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66) ret i64 %val } + +define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) { +; SM20-LABEL: funnel_shift_right_32( +; SM20: { +; SM20-NEXT: .reg .b32 %r<11>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: 
ld.param.u32 %r1, [funnel_shift_right_32_param_0]; +; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2]; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1]; +; SM20-NEXT: shr.u32 %r5, %r4, %r3; +; SM20-NEXT: shl.b32 %r6, %r1, 1; +; SM20-NEXT: not.b32 %r7, %r2; +; SM20-NEXT: and.b32 %r8, %r7, 31; +; SM20-NEXT: shl.b32 %r9, %r6, %r8; +; SM20-NEXT: or.b32 %r10, %r9, %r5; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_right_32( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; +; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1]; +; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2]; +; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, %r3; +; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; +; SM35-NEXT: ret; + %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c) + ret i32 %val +} + +define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) { +; SM20-LABEL: funnel_shift_left_32( +; SM20: { +; SM20-NEXT: .reg .b32 %r<11>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; +; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2]; +; SM20-NEXT: and.b32 %r3, %r2, 31; +; SM20-NEXT: shl.b32 %r4, %r1, %r3; +; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1]; +; SM20-NEXT: shr.u32 %r6, %r5, 1; +; SM20-NEXT: not.b32 %r7, %r2; +; SM20-NEXT: and.b32 %r8, %r7, 31; +; SM20-NEXT: shr.u32 %r9, %r6, %r8; +; SM20-NEXT: or.b32 %r10, %r4, %r9; +; SM20-NEXT: st.param.b32 [func_retval0+0], %r10; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_left_32( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; +; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1]; +; SM35-NEXT: ld.param.u32 %r3, 
[funnel_shift_left_32_param_2]; +; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, %r3; +; SM35-NEXT: st.param.b32 [func_retval0+0], %r4; +; SM35-NEXT: ret; + %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) + ret i32 %val +} + +define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) { +; SM20-LABEL: funnel_shift_right_64( +; SM20: { +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<7>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; +; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM20-NEXT: shr.u64 %rd3, %rd2, %r2; +; SM20-NEXT: shl.b64 %rd4, %rd1, 1; +; SM20-NEXT: not.b32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shl.b64 %rd5, %rd4, %r4; +; SM20-NEXT: or.b64 %rd6, %rd5, %rd3; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_right_64( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<7>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM35-NEXT: shr.u64 %rd3, %rd2, %r2; +; SM35-NEXT: shl.b64 %rd4, %rd1, 1; +; SM35-NEXT: not.b32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shl.b64 %rd5, %rd4, %r4; +; SM35-NEXT: or.b64 %rd6, %rd5, %rd3; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + ret i64 %val +} + +define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { +; SM20-LABEL: funnel_shift_left_64( +; SM20: { +; SM20-NEXT: .reg .b32 %r<5>; +; SM20-NEXT: .reg .b64 %rd<7>; +; SM20-EMPTY: +; SM20-NEXT: // %bb.0: +; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; +; 
SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM20-NEXT: and.b32 %r2, %r1, 63; +; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM20-NEXT: shr.u64 %rd4, %rd3, 1; +; SM20-NEXT: not.b32 %r3, %r1; +; SM20-NEXT: and.b32 %r4, %r3, 63; +; SM20-NEXT: shr.u64 %rd5, %rd4, %r4; +; SM20-NEXT: or.b64 %rd6, %rd2, %rd5; +; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM20-NEXT: ret; +; +; SM35-LABEL: funnel_shift_left_64( +; SM35: { +; SM35-NEXT: .reg .b32 %r<5>; +; SM35-NEXT: .reg .b64 %rd<7>; +; SM35-EMPTY: +; SM35-NEXT: // %bb.0: +; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; +; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM35-NEXT: and.b32 %r2, %r1, 63; +; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; +; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM35-NEXT: shr.u64 %rd4, %rd3, 1; +; SM35-NEXT: not.b32 %r3, %r1; +; SM35-NEXT: and.b32 %r4, %r3, 63; +; SM35-NEXT: shr.u64 %rd5, %rd4, %r4; +; SM35-NEXT: or.b64 %rd6, %rd2, %rd5; +; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6; +; SM35-NEXT: ret; + %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c) + ret i64 %val +} + diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll index 64659ce1b5c5..05fdb02ac747 100644 --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -1,25 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} declare i64 @llvm.nvvm.rotate.b64(i64, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) -; CHECK: rotate64 define i64 @rotate64(i64 %a, i32 %b) { -; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3; -; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61; -; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; -; CHECK: ret +; CHECK-LABEL: rotate64( +; CHECK: { +; 
CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; +; CHECK-NEXT: shr.u64 %rd2, %rd1, 61; +; CHECK-NEXT: shl.b64 %rd3, %rd1, 3; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; +; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3) ret i64 %val } -; CHECK: rotateright64 define i64 @rotateright64(i64 %a, i32 %b) { -; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 61; -; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 3; -; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]]; -; CHECK: ret +; CHECK-LABEL: rotateright64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; +; CHECK-NEXT: shl.b64 %rd2, %rd1, 61; +; CHECK-NEXT: shr.u64 %rd3, %rd1, 3; +; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4; +; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3) ret i64 %val } diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll index c48361e0a803..72de456cba39 100644 --- a/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll +++ b/llvm/test/CodeGen/PowerPC/ctrloop-sh.ll @@ -8,58 +8,52 @@ define void @foo1(ptr %a, ptr readonly %b, ptr readonly %c) #0 { ; CHECK-LABEL: foo1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: stwu 1, -64(1) -; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; CHECK-NEXT: li 8, 2048 ; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: li 7, 2048 ; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; CHECK-NEXT: li 7, 7 -; CHECK-NEXT: mtctr 8 -; CHECK-NEXT: addi 8, 1, 16 +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: mtctr 7 +; CHECK-NEXT: addi 7, 1, 16 ; CHECK-NEXT: .LBB0_1: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lwz 9, 0(4) -; CHECK-NEXT: lwz 10, 4(4) -; CHECK-NEXT: lwz 11, 8(4) -; CHECK-NEXT: lwz 12, 12(4) -; CHECK-NEXT: lwz 
0, 12(5) +; CHECK-NEXT: lwz 8, 0(4) +; CHECK-NEXT: lwz 9, 4(4) +; CHECK-NEXT: lwz 10, 8(4) +; CHECK-NEXT: lwz 11, 12(4) +; CHECK-NEXT: lwz 12, 12(5) ; CHECK-NEXT: stw 6, 44(1) ; CHECK-NEXT: stw 6, 40(1) ; CHECK-NEXT: stw 6, 36(1) ; CHECK-NEXT: stw 6, 32(1) -; CHECK-NEXT: stw 12, 28(1) -; CHECK-NEXT: clrlwi 12, 0, 29 -; CHECK-NEXT: stw 11, 24(1) -; CHECK-NEXT: nand 11, 0, 7 -; CHECK-NEXT: stw 10, 20(1) -; CHECK-NEXT: subfic 29, 12, 32 -; CHECK-NEXT: stw 9, 16(1) -; CHECK-NEXT: rlwinm 9, 0, 29, 28, 31 -; CHECK-NEXT: lwzux 10, 9, 8 -; CHECK-NEXT: clrlwi 11, 11, 27 -; CHECK-NEXT: lwz 0, 8(9) -; CHECK-NEXT: slw 10, 10, 12 -; CHECK-NEXT: lwz 30, 4(9) -; CHECK-NEXT: lwz 9, 12(9) -; CHECK-NEXT: slw 28, 30, 12 -; CHECK-NEXT: srw 30, 30, 29 -; CHECK-NEXT: srw 29, 9, 29 -; CHECK-NEXT: slw 9, 9, 12 -; CHECK-NEXT: slw 12, 0, 12 -; CHECK-NEXT: srwi 0, 0, 1 -; CHECK-NEXT: stw 9, 12(3) -; CHECK-NEXT: or 9, 12, 29 -; CHECK-NEXT: srw 11, 0, 11 -; CHECK-NEXT: stw 9, 8(3) -; CHECK-NEXT: or 9, 10, 30 -; CHECK-NEXT: stw 9, 0(3) -; CHECK-NEXT: or 9, 28, 11 -; CHECK-NEXT: stw 9, 4(3) +; CHECK-NEXT: stw 11, 28(1) +; CHECK-NEXT: stw 10, 24(1) +; CHECK-NEXT: clrlwi 10, 12, 27 +; CHECK-NEXT: stw 9, 20(1) +; CHECK-NEXT: stw 8, 16(1) +; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29 +; CHECK-NEXT: lwzux 9, 8, 7 +; CHECK-NEXT: subfic 12, 10, 32 +; CHECK-NEXT: lwz 11, 8(8) +; CHECK-NEXT: slw 9, 9, 10 +; CHECK-NEXT: lwz 0, 4(8) +; CHECK-NEXT: lwz 8, 12(8) +; CHECK-NEXT: srw 30, 11, 12 +; CHECK-NEXT: slw 29, 0, 10 +; CHECK-NEXT: srw 0, 0, 12 +; CHECK-NEXT: srw 12, 8, 12 +; CHECK-NEXT: slw 11, 11, 10 +; CHECK-NEXT: slw 8, 8, 10 +; CHECK-NEXT: stw 8, 12(3) +; CHECK-NEXT: or 8, 11, 12 +; CHECK-NEXT: stw 8, 8(3) +; CHECK-NEXT: or 8, 9, 0 +; CHECK-NEXT: stw 8, 0(3) +; CHECK-NEXT: or 8, 29, 30 +; CHECK-NEXT: stw 8, 4(3) ; CHECK-NEXT: bdnz .LBB0_1 ; CHECK-NEXT: # %bb.2: # %for.end ; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; CHECK-NEXT: lwz 28, 48(1) # 
4-byte Folded Reload ; CHECK-NEXT: addi 1, 1, 64 ; CHECK-NEXT: blr entry: @@ -83,59 +77,53 @@ for.end: ; preds = %for.body define void @foo2(ptr %a, ptr readonly %b, ptr readonly %c) #0 { ; CHECK-LABEL: foo2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stwu 1, -64(1) -; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; CHECK-NEXT: li 7, 2048 -; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; CHECK-NEXT: li 6, 7 -; CHECK-NEXT: mtctr 7 -; CHECK-NEXT: addi 7, 1, 36 +; CHECK-NEXT: stwu 1, -48(1) +; CHECK-NEXT: stw 30, 40(1) # 4-byte Folded Spill +; CHECK-NEXT: li 6, 2048 +; CHECK-NEXT: mtctr 6 +; CHECK-NEXT: addi 6, 1, 24 ; CHECK-NEXT: .LBB1_1: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lwz 8, 0(4) -; CHECK-NEXT: lwz 10, 8(4) -; CHECK-NEXT: lwz 12, 12(5) -; CHECK-NEXT: lwz 9, 4(4) -; CHECK-NEXT: lwz 11, 12(4) -; CHECK-NEXT: stw 10, 44(1) -; CHECK-NEXT: rlwinm 10, 12, 29, 28, 31 -; CHECK-NEXT: stw 8, 36(1) -; CHECK-NEXT: srawi 8, 8, 31 -; CHECK-NEXT: stw 11, 48(1) -; CHECK-NEXT: clrlwi 11, 12, 29 -; CHECK-NEXT: stw 9, 40(1) -; CHECK-NEXT: nand 9, 12, 6 -; CHECK-NEXT: stw 8, 32(1) -; CHECK-NEXT: subfic 30, 11, 32 +; CHECK-NEXT: lwz 7, 0(4) +; CHECK-NEXT: lwz 8, 4(4) +; CHECK-NEXT: lwz 11, 12(5) +; CHECK-NEXT: lwz 9, 8(4) +; CHECK-NEXT: lwz 10, 12(4) ; CHECK-NEXT: stw 8, 28(1) -; CHECK-NEXT: clrlwi 9, 9, 27 -; CHECK-NEXT: stw 8, 24(1) -; CHECK-NEXT: stw 8, 20(1) -; CHECK-NEXT: sub 8, 7, 10 -; CHECK-NEXT: lwz 10, 4(8) -; CHECK-NEXT: lwz 12, 8(8) -; CHECK-NEXT: lwz 0, 0(8) -; CHECK-NEXT: lwz 8, 12(8) -; CHECK-NEXT: srw 29, 12, 11 -; CHECK-NEXT: slw 12, 12, 30 -; CHECK-NEXT: slw 30, 0, 30 -; CHECK-NEXT: srw 8, 8, 11 -; CHECK-NEXT: sraw 0, 0, 11 -; CHECK-NEXT: srw 11, 10, 11 -; CHECK-NEXT: slwi 10, 10, 1 -; CHECK-NEXT: or 8, 12, 8 -; CHECK-NEXT: slw 9, 10, 9 -; CHECK-NEXT: stw 8, 12(3) -; CHECK-NEXT: or 8, 30, 11 -; CHECK-NEXT: stw 8, 4(3) -; CHECK-NEXT: or 8, 29, 9 -; CHECK-NEXT: stw 0, 0(3) -; CHECK-NEXT: stw 8, 8(3) +; CHECK-NEXT: rlwinm 8, 11, 29, 28, 29 +; CHECK-NEXT: 
stw 7, 24(1) +; CHECK-NEXT: srawi 7, 7, 31 +; CHECK-NEXT: stw 10, 36(1) +; CHECK-NEXT: clrlwi 10, 11, 27 +; CHECK-NEXT: stw 9, 32(1) +; CHECK-NEXT: subfic 12, 10, 32 +; CHECK-NEXT: stw 7, 20(1) +; CHECK-NEXT: stw 7, 16(1) +; CHECK-NEXT: stw 7, 12(1) +; CHECK-NEXT: stw 7, 8(1) +; CHECK-NEXT: sub 7, 6, 8 +; CHECK-NEXT: lwz 8, 4(7) +; CHECK-NEXT: lwz 9, 0(7) +; CHECK-NEXT: lwz 11, 12(7) +; CHECK-NEXT: srw 0, 8, 10 +; CHECK-NEXT: lwz 7, 8(7) +; CHECK-NEXT: slw 30, 9, 12 +; CHECK-NEXT: slw 8, 8, 12 +; CHECK-NEXT: srw 11, 11, 10 +; CHECK-NEXT: slw 12, 7, 12 +; CHECK-NEXT: srw 7, 7, 10 +; CHECK-NEXT: or 7, 8, 7 +; CHECK-NEXT: stw 7, 8(3) +; CHECK-NEXT: or 7, 12, 11 +; CHECK-NEXT: sraw 9, 9, 10 +; CHECK-NEXT: stw 7, 12(3) +; CHECK-NEXT: or 7, 30, 0 +; CHECK-NEXT: stw 9, 0(3) +; CHECK-NEXT: stw 7, 4(3) ; CHECK-NEXT: bdnz .LBB1_1 ; CHECK-NEXT: # %bb.2: # %for.end -; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload -; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; CHECK-NEXT: addi 1, 1, 64 +; CHECK-NEXT: lwz 30, 40(1) # 4-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 48 ; CHECK-NEXT: blr entry: br label %for.body @@ -159,59 +147,53 @@ define void @foo3(ptr %a, ptr readonly %b, ptr readonly %c) #0 { ; CHECK-LABEL: foo3: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: stwu 1, -64(1) -; CHECK-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; CHECK-NEXT: li 8, 2048 ; CHECK-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: li 7, 2048 ; CHECK-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; CHECK-NEXT: li 7, 7 -; CHECK-NEXT: mtctr 8 -; CHECK-NEXT: addi 8, 1, 32 +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: mtctr 7 +; CHECK-NEXT: addi 7, 1, 32 ; CHECK-NEXT: .LBB2_1: # %for.body ; CHECK-NEXT: # -; CHECK-NEXT: lwz 10, 4(4) -; CHECK-NEXT: lwz 0, 12(5) -; CHECK-NEXT: lwz 9, 0(4) -; CHECK-NEXT: lwz 11, 8(4) -; CHECK-NEXT: lwz 12, 12(4) -; CHECK-NEXT: stw 10, 36(1) -; CHECK-NEXT: rlwinm 10, 0, 29, 28, 31 +; CHECK-NEXT: lwz 8, 0(4) +; CHECK-NEXT: lwz 12, 12(5) +; CHECK-NEXT: lwz 
9, 4(4) +; CHECK-NEXT: lwz 10, 8(4) +; CHECK-NEXT: lwz 11, 12(4) +; CHECK-NEXT: stw 8, 32(1) +; CHECK-NEXT: rlwinm 8, 12, 29, 28, 29 ; CHECK-NEXT: stw 6, 28(1) -; CHECK-NEXT: sub 10, 8, 10 +; CHECK-NEXT: sub 8, 7, 8 ; CHECK-NEXT: stw 6, 24(1) ; CHECK-NEXT: stw 6, 20(1) ; CHECK-NEXT: stw 6, 16(1) -; CHECK-NEXT: stw 12, 44(1) -; CHECK-NEXT: clrlwi 12, 0, 29 -; CHECK-NEXT: stw 11, 40(1) -; CHECK-NEXT: subfic 29, 12, 32 -; CHECK-NEXT: stw 9, 32(1) -; CHECK-NEXT: nand 9, 0, 7 -; CHECK-NEXT: lwz 11, 4(10) -; CHECK-NEXT: clrlwi 9, 9, 27 -; CHECK-NEXT: lwz 0, 8(10) -; CHECK-NEXT: lwz 30, 0(10) -; CHECK-NEXT: lwz 10, 12(10) -; CHECK-NEXT: srw 28, 0, 12 -; CHECK-NEXT: slw 0, 0, 29 -; CHECK-NEXT: slw 29, 30, 29 -; CHECK-NEXT: srw 10, 10, 12 -; CHECK-NEXT: srw 30, 30, 12 -; CHECK-NEXT: srw 12, 11, 12 -; CHECK-NEXT: slwi 11, 11, 1 -; CHECK-NEXT: slw 9, 11, 9 -; CHECK-NEXT: or 10, 0, 10 -; CHECK-NEXT: stw 10, 12(3) -; CHECK-NEXT: or 10, 29, 12 -; CHECK-NEXT: or 9, 28, 9 -; CHECK-NEXT: stw 30, 0(3) -; CHECK-NEXT: stw 10, 4(3) -; CHECK-NEXT: stw 9, 8(3) +; CHECK-NEXT: stw 11, 44(1) +; CHECK-NEXT: clrlwi 11, 12, 27 +; CHECK-NEXT: stw 10, 40(1) +; CHECK-NEXT: subfic 0, 11, 32 +; CHECK-NEXT: stw 9, 36(1) +; CHECK-NEXT: lwz 9, 4(8) +; CHECK-NEXT: lwz 10, 0(8) +; CHECK-NEXT: lwz 12, 12(8) +; CHECK-NEXT: srw 30, 9, 11 +; CHECK-NEXT: lwz 8, 8(8) +; CHECK-NEXT: slw 29, 10, 0 +; CHECK-NEXT: slw 9, 9, 0 +; CHECK-NEXT: srw 12, 12, 11 +; CHECK-NEXT: slw 0, 8, 0 +; CHECK-NEXT: srw 8, 8, 11 +; CHECK-NEXT: or 8, 9, 8 +; CHECK-NEXT: stw 8, 8(3) +; CHECK-NEXT: or 8, 0, 12 +; CHECK-NEXT: srw 10, 10, 11 +; CHECK-NEXT: stw 8, 12(3) +; CHECK-NEXT: or 8, 29, 30 +; CHECK-NEXT: stw 10, 0(3) +; CHECK-NEXT: stw 8, 4(3) ; CHECK-NEXT: bdnz .LBB2_1 ; CHECK-NEXT: # %bb.2: # %for.end ; CHECK-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; CHECK-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; CHECK-NEXT: lwz 28, 48(1) # 4-byte Folded Reload ; CHECK-NEXT: addi 1, 1, 64 ; CHECK-NEXT: blr entry: diff --git 
a/llvm/test/CodeGen/PowerPC/pr59074.ll b/llvm/test/CodeGen/PowerPC/pr59074.ll index 3e328c6ad9f0..d3ca1139b4fd 100644 --- a/llvm/test/CodeGen/PowerPC/pr59074.ll +++ b/llvm/test/CodeGen/PowerPC/pr59074.ll @@ -32,37 +32,36 @@ define void @pr59074(ptr %0) { ; LE32-NEXT: li 7, 0 ; LE32-NEXT: li 8, 12 ; LE32-NEXT: xxswapd 0, 0 +; LE32-NEXT: rlwimi 5, 6, 0, 30, 28 ; LE32-NEXT: addi 4, 4, -12 -; LE32-NEXT: rlwinm 9, 4, 29, 28, 31 -; LE32-NEXT: stxvd2x 0, 6, 5 +; LE32-NEXT: rlwinm 9, 4, 29, 28, 29 +; LE32-NEXT: stxvd2x 0, 0, 5 ; LE32-NEXT: stw 7, 44(1) ; LE32-NEXT: stw 7, 40(1) ; LE32-NEXT: stw 7, 36(1) ; LE32-NEXT: stw 8, 16(1) +; LE32-NEXT: clrlwi 4, 4, 27 ; LE32-NEXT: lwzux 5, 9, 6 -; LE32-NEXT: li 6, 7 -; LE32-NEXT: lwz 7, 8(9) -; LE32-NEXT: nand 6, 4, 6 -; LE32-NEXT: lwz 8, 4(9) -; LE32-NEXT: clrlwi 4, 4, 29 -; LE32-NEXT: lwz 9, 12(9) -; LE32-NEXT: clrlwi 6, 6, 27 +; LE32-NEXT: lwz 6, 8(9) +; LE32-NEXT: lwz 7, 4(9) +; LE32-NEXT: lwz 8, 12(9) +; LE32-NEXT: xori 9, 4, 31 ; LE32-NEXT: subfic 11, 4, 32 ; LE32-NEXT: srw 5, 5, 4 -; LE32-NEXT: slwi 10, 7, 1 -; LE32-NEXT: srw 7, 7, 4 -; LE32-NEXT: slw 6, 10, 6 -; LE32-NEXT: srw 10, 8, 4 -; LE32-NEXT: slw 8, 8, 11 -; LE32-NEXT: slw 11, 9, 11 -; LE32-NEXT: srw 4, 9, 4 -; LE32-NEXT: or 5, 8, 5 -; LE32-NEXT: or 7, 11, 7 -; LE32-NEXT: or 6, 10, 6 +; LE32-NEXT: slwi 10, 6, 1 +; LE32-NEXT: srw 6, 6, 4 +; LE32-NEXT: slw 9, 10, 9 +; LE32-NEXT: srw 10, 7, 4 +; LE32-NEXT: slw 7, 7, 11 +; LE32-NEXT: slw 11, 8, 11 +; LE32-NEXT: srw 4, 8, 4 +; LE32-NEXT: or 5, 7, 5 +; LE32-NEXT: or 6, 11, 6 +; LE32-NEXT: or 7, 10, 9 ; LE32-NEXT: stw 4, 12(3) -; LE32-NEXT: stw 7, 8(3) +; LE32-NEXT: stw 6, 8(3) ; LE32-NEXT: stw 5, 0(3) -; LE32-NEXT: stw 6, 4(3) +; LE32-NEXT: stw 7, 4(3) ; LE32-NEXT: addi 1, 1, 80 ; LE32-NEXT: blr ; @@ -89,37 +88,33 @@ define void @pr59074(ptr %0) { ; BE32-NEXT: li 6, 12 ; BE32-NEXT: li 7, 0 ; BE32-NEXT: addi 8, 1, -48 -; BE32-NEXT: li 10, 7 ; BE32-NEXT: stxvw4x 0, 0, 5 -; BE32-NEXT: addi 4, 4, -12 ; BE32-NEXT: stw 6, -36(1) 
+; BE32-NEXT: addi 4, 4, -12 ; BE32-NEXT: stw 7, -40(1) ; BE32-NEXT: stw 7, -44(1) -; BE32-NEXT: rlwinm 9, 4, 29, 28, 31 ; BE32-NEXT: stw 7, -48(1) +; BE32-NEXT: rlwinm 9, 4, 29, 28, 29 +; BE32-NEXT: clrlwi 4, 4, 27 ; BE32-NEXT: sub 5, 8, 9 -; BE32-NEXT: nand 6, 4, 10 -; BE32-NEXT: clrlwi 4, 4, 29 -; BE32-NEXT: clrlwi 6, 6, 27 -; BE32-NEXT: lwz 7, 4(5) -; BE32-NEXT: lwz 8, 8(5) -; BE32-NEXT: lwz 9, 0(5) -; BE32-NEXT: lwz 5, 12(5) -; BE32-NEXT: slwi 10, 7, 1 -; BE32-NEXT: srw 11, 8, 4 -; BE32-NEXT: srw 7, 7, 4 -; BE32-NEXT: srw 5, 5, 4 -; BE32-NEXT: slw 6, 10, 6 +; BE32-NEXT: lwz 6, 4(5) +; BE32-NEXT: lwz 7, 0(5) +; BE32-NEXT: lwz 8, 12(5) +; BE32-NEXT: lwz 5, 8(5) ; BE32-NEXT: subfic 10, 4, 32 -; BE32-NEXT: srw 4, 9, 4 -; BE32-NEXT: slw 8, 8, 10 -; BE32-NEXT: slw 10, 9, 10 -; BE32-NEXT: or 6, 11, 6 -; BE32-NEXT: or 7, 10, 7 -; BE32-NEXT: or 5, 8, 5 +; BE32-NEXT: srw 9, 6, 4 +; BE32-NEXT: slw 11, 7, 10 +; BE32-NEXT: srw 8, 8, 4 +; BE32-NEXT: slw 6, 6, 10 +; BE32-NEXT: slw 10, 5, 10 +; BE32-NEXT: srw 5, 5, 4 +; BE32-NEXT: srw 4, 7, 4 +; BE32-NEXT: or 7, 11, 9 +; BE32-NEXT: or 8, 10, 8 +; BE32-NEXT: or 5, 6, 5 ; BE32-NEXT: stw 4, 0(3) -; BE32-NEXT: stw 6, 8(3) -; BE32-NEXT: stw 5, 12(3) +; BE32-NEXT: stw 5, 8(3) +; BE32-NEXT: stw 8, 12(3) ; BE32-NEXT: stw 7, 4(3) ; BE32-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll index f6fdb4ae2079..4f1b7bdc8b55 100644 --- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -233,9 +233,96 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 9, 8(3) ; LE-32BIT-NEXT: lwz 3, 12(3) ; LE-32BIT-NEXT: lwz 4, 12(4) +; LE-32BIT-NEXT: stw 6, 28(1) +; LE-32BIT-NEXT: stw 6, 24(1) +; LE-32BIT-NEXT: stw 6, 20(1) +; LE-32BIT-NEXT: stw 6, 16(1) +; 
LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29 ; LE-32BIT-NEXT: stw 3, 44(1) ; LE-32BIT-NEXT: addi 3, 1, 32 -; LE-32BIT-NEXT: clrlwi 4, 4, 28 +; LE-32BIT-NEXT: stw 9, 40(1) +; LE-32BIT-NEXT: sub 3, 3, 6 +; LE-32BIT-NEXT: stw 8, 36(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 7, 32(1) +; LE-32BIT-NEXT: subfic 9, 4, 32 +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 10, 6, 4 +; LE-32BIT-NEXT: lwz 3, 8(3) +; LE-32BIT-NEXT: slw 11, 7, 9 +; LE-32BIT-NEXT: slw 6, 6, 9 +; LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: slw 9, 3, 9 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: or 3, 6, 3 +; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 9, 8 +; LE-32BIT-NEXT: srw 4, 7, 4 +; LE-32BIT-NEXT: stw 3, 12(5) +; LE-32BIT-NEXT: or 3, 11, 10 +; LE-32BIT-NEXT: stw 4, 0(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 48 +; LE-32BIT-NEXT: blr + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = lshr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: lshr_16bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: lwz 4, 0(4) +; LE-64BIT-NEXT: ld 6, 8(3) +; LE-64BIT-NEXT: ld 3, 0(3) +; LE-64BIT-NEXT: slwi 4, 4, 5 +; LE-64BIT-NEXT: subfic 7, 4, 64 +; LE-64BIT-NEXT: srd 3, 3, 4 +; LE-64BIT-NEXT: sld 7, 6, 7 +; LE-64BIT-NEXT: or 3, 3, 7 +; LE-64BIT-NEXT: addi 7, 4, -64 +; LE-64BIT-NEXT: srd 4, 6, 4 +; LE-64BIT-NEXT: srd 7, 6, 7 +; LE-64BIT-NEXT: std 4, 8(5) +; LE-64BIT-NEXT: or 3, 3, 7 +; LE-64BIT-NEXT: std 3, 0(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: lshr_16bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: lwz 4, 12(4) +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: ld 3, 8(3) +; BE-NEXT: slwi 4, 4, 5 +; BE-NEXT: subfic 7, 4, 64 +; BE-NEXT: srd 3, 3, 4 +; BE-NEXT: sld 7, 6, 7 +; BE-NEXT: addi 8, 4, -64 +; 
BE-NEXT: or 3, 3, 7 +; BE-NEXT: srd 7, 6, 8 +; BE-NEXT: srd 4, 6, 4 +; BE-NEXT: or 3, 3, 7 +; BE-NEXT: std 4, 0(5) +; BE-NEXT: std 3, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: lshr_16bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -48(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 3, 12(3) +; LE-32BIT-NEXT: lwz 4, 12(4) +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: addi 3, 1, 32 +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29 ; LE-32BIT-NEXT: stw 6, 28(1) ; LE-32BIT-NEXT: sub 3, 3, 4 ; LE-32BIT-NEXT: stw 6, 24(1) @@ -255,12 +342,13 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: addi 1, 1, 48 ; LE-32BIT-NEXT: blr %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr %byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 %res = lshr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } + define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-LABEL: shl_16bytes: ; LE-64BIT: # %bb.0: @@ -309,7 +397,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 4, 12(4) ; LE-32BIT-NEXT: stw 6, 44(1) ; LE-32BIT-NEXT: stw 6, 40(1) -; LE-32BIT-NEXT: clrlwi 4, 4, 28 +; LE-32BIT-NEXT: stw 6, 36(1) +; LE-32BIT-NEXT: stw 6, 32(1) +; LE-32BIT-NEXT: rlwinm 6, 4, 0, 28, 29 +; LE-32BIT-NEXT: stw 3, 28(1) +; LE-32BIT-NEXT: addi 3, 1, 16 +; LE-32BIT-NEXT: stw 9, 24(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 8, 20(1) +; LE-32BIT-NEXT: subfic 8, 4, 32 +; LE-32BIT-NEXT: stw 7, 16(1) +; LE-32BIT-NEXT: lwzux 3, 6, 3 +; LE-32BIT-NEXT: lwz 9, 4(6) +; LE-32BIT-NEXT: slw 3, 3, 4 +; LE-32BIT-NEXT: lwz 7, 8(6) +; LE-32BIT-NEXT: lwz 6, 12(6) +; LE-32BIT-NEXT: slw 11, 9, 4 +; LE-32BIT-NEXT: srw 9, 9, 8 +; LE-32BIT-NEXT: srw 10, 7, 8 +; 
LE-32BIT-NEXT: srw 8, 6, 8 +; LE-32BIT-NEXT: slw 7, 7, 4 +; LE-32BIT-NEXT: slw 4, 6, 4 +; LE-32BIT-NEXT: or 3, 3, 9 +; LE-32BIT-NEXT: stw 4, 12(5) +; LE-32BIT-NEXT: or 4, 7, 8 +; LE-32BIT-NEXT: stw 3, 0(5) +; LE-32BIT-NEXT: or 3, 11, 10 +; LE-32BIT-NEXT: stw 4, 8(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 48 +; LE-32BIT-NEXT: blr + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = shl i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: shl_16bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: lwz 4, 0(4) +; LE-64BIT-NEXT: ld 6, 0(3) +; LE-64BIT-NEXT: ld 3, 8(3) +; LE-64BIT-NEXT: slwi 4, 4, 5 +; LE-64BIT-NEXT: subfic 7, 4, 64 +; LE-64BIT-NEXT: sld 3, 3, 4 +; LE-64BIT-NEXT: srd 7, 6, 7 +; LE-64BIT-NEXT: or 3, 3, 7 +; LE-64BIT-NEXT: addi 7, 4, -64 +; LE-64BIT-NEXT: sld 4, 6, 4 +; LE-64BIT-NEXT: sld 7, 6, 7 +; LE-64BIT-NEXT: std 4, 0(5) +; LE-64BIT-NEXT: or 3, 3, 7 +; LE-64BIT-NEXT: std 3, 8(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: shl_16bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: lwz 4, 12(4) +; BE-NEXT: ld 6, 8(3) +; BE-NEXT: ld 3, 0(3) +; BE-NEXT: slwi 4, 4, 5 +; BE-NEXT: subfic 7, 4, 64 +; BE-NEXT: sld 3, 3, 4 +; BE-NEXT: srd 7, 6, 7 +; BE-NEXT: addi 8, 4, -64 +; BE-NEXT: or 3, 3, 7 +; BE-NEXT: sld 7, 6, 8 +; BE-NEXT: sld 4, 6, 4 +; BE-NEXT: or 3, 3, 7 +; BE-NEXT: std 4, 8(5) +; BE-NEXT: std 3, 0(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: shl_16bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -48(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 3, 12(3) +; LE-32BIT-NEXT: lwz 4, 12(4) +; LE-32BIT-NEXT: stw 6, 44(1) +; LE-32BIT-NEXT: stw 6, 40(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29 ; LE-32BIT-NEXT: stw 6, 36(1) ; LE-32BIT-NEXT: stw 6, 
32(1) ; LE-32BIT-NEXT: stw 3, 28(1) @@ -328,12 +502,13 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: addi 1, 1, 48 ; LE-32BIT-NEXT: blr %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr %byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 %res = shl i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } + define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-LABEL: ashr_16bytes: ; LE-64BIT: # %bb.0: @@ -361,17 +536,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; BE-NEXT: slwi 4, 4, 3 ; BE-NEXT: addi 7, 4, -64 ; BE-NEXT: cmpwi 7, 1 -; BE-NEXT: blt 0, .LBB8_2 +; BE-NEXT: blt 0, .LBB10_2 ; BE-NEXT: # %bb.1: ; BE-NEXT: srad 3, 6, 7 -; BE-NEXT: b .LBB8_3 -; BE-NEXT: .LBB8_2: +; BE-NEXT: b .LBB10_3 +; BE-NEXT: .LBB10_2: ; BE-NEXT: ld 3, 8(3) ; BE-NEXT: subfic 7, 4, 64 ; BE-NEXT: sld 7, 6, 7 ; BE-NEXT: srd 3, 3, 4 ; BE-NEXT: or 3, 3, 7 -; BE-NEXT: .LBB8_3: +; BE-NEXT: .LBB10_3: ; BE-NEXT: srad 4, 6, 4 ; BE-NEXT: std 3, 8(5) ; BE-NEXT: std 4, 0(5) @@ -388,7 +563,100 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 4, 12(4) ; LE-32BIT-NEXT: stw 3, 44(1) ; LE-32BIT-NEXT: srawi 3, 7, 31 -; LE-32BIT-NEXT: clrlwi 4, 4, 28 +; LE-32BIT-NEXT: stw 7, 32(1) +; LE-32BIT-NEXT: rlwinm 7, 4, 0, 28, 29 +; LE-32BIT-NEXT: stw 9, 40(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 8, 36(1) +; LE-32BIT-NEXT: subfic 9, 4, 32 +; LE-32BIT-NEXT: stw 3, 28(1) +; LE-32BIT-NEXT: stw 3, 24(1) +; LE-32BIT-NEXT: stw 3, 20(1) +; LE-32BIT-NEXT: stw 3, 16(1) +; LE-32BIT-NEXT: sub 3, 6, 7 +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 10, 6, 4 +; LE-32BIT-NEXT: lwz 3, 8(3) +; LE-32BIT-NEXT: slw 11, 7, 9 +; LE-32BIT-NEXT: slw 6, 6, 9 +; 
LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: slw 9, 3, 9 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: or 3, 6, 3 +; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 9, 8 +; LE-32BIT-NEXT: sraw 4, 7, 4 +; LE-32BIT-NEXT: stw 3, 12(5) +; LE-32BIT-NEXT: or 3, 11, 10 +; LE-32BIT-NEXT: stw 4, 0(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 48 +; LE-32BIT-NEXT: blr + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = ashr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: ashr_16bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: lwz 4, 0(4) +; LE-64BIT-NEXT: ld 6, 8(3) +; LE-64BIT-NEXT: ld 3, 0(3) +; LE-64BIT-NEXT: slwi 4, 4, 5 +; LE-64BIT-NEXT: subfic 7, 4, 64 +; LE-64BIT-NEXT: srd 3, 3, 4 +; LE-64BIT-NEXT: sld 7, 6, 7 +; LE-64BIT-NEXT: or 3, 3, 7 +; LE-64BIT-NEXT: addi 7, 4, -64 +; LE-64BIT-NEXT: srad 4, 6, 4 +; LE-64BIT-NEXT: cmpwi 7, 1 +; LE-64BIT-NEXT: srad 8, 6, 7 +; LE-64BIT-NEXT: std 4, 8(5) +; LE-64BIT-NEXT: isellt 3, 3, 8 +; LE-64BIT-NEXT: std 3, 0(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: ashr_16bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: lwz 4, 12(4) +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: slwi 4, 4, 5 +; BE-NEXT: addi 7, 4, -64 +; BE-NEXT: cmpwi 7, 1 +; BE-NEXT: blt 0, .LBB11_2 +; BE-NEXT: # %bb.1: +; BE-NEXT: srad 3, 6, 7 +; BE-NEXT: b .LBB11_3 +; BE-NEXT: .LBB11_2: +; BE-NEXT: ld 3, 8(3) +; BE-NEXT: subfic 7, 4, 64 +; BE-NEXT: sld 7, 6, 7 +; BE-NEXT: srd 3, 3, 4 +; BE-NEXT: or 3, 3, 7 +; BE-NEXT: .LBB11_3: +; BE-NEXT: srad 4, 6, 4 +; BE-NEXT: std 3, 8(5) +; BE-NEXT: std 4, 0(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: ashr_16bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -48(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: addi 6, 1, 32 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 3, 
12(3) +; LE-32BIT-NEXT: lwz 4, 12(4) +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: srawi 3, 7, 31 +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 28, 29 ; LE-32BIT-NEXT: stw 9, 40(1) ; LE-32BIT-NEXT: stw 8, 36(1) ; LE-32BIT-NEXT: stw 7, 32(1) @@ -408,8 +676,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: addi 1, 1, 48 ; LE-32BIT-NEXT: blr %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr %byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 %res = ashr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void @@ -422,12 +690,324 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: lxvd2x 1, 0, 3 ; LE-64BIT-NEXT: xxlxor 2, 2, 2 ; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: li 8, 32 +; LE-64BIT-NEXT: lxvd2x 0, 3, 6 +; LE-64BIT-NEXT: lwz 3, 0(4) +; LE-64BIT-NEXT: li 4, 48 +; LE-64BIT-NEXT: stxvd2x 2, 7, 4 +; LE-64BIT-NEXT: stxvd2x 2, 7, 8 +; LE-64BIT-NEXT: rlwinm 4, 3, 0, 27, 28 +; LE-64BIT-NEXT: rlwinm 3, 3, 3, 26, 28 +; LE-64BIT-NEXT: stxvd2x 0, 7, 6 +; LE-64BIT-NEXT: stxvd2x 1, 0, 7 +; LE-64BIT-NEXT: ldux 6, 4, 7 +; LE-64BIT-NEXT: subfic 7, 3, 64 +; LE-64BIT-NEXT: ld 8, 8(4) +; LE-64BIT-NEXT: ld 9, 16(4) +; LE-64BIT-NEXT: ld 4, 24(4) +; LE-64BIT-NEXT: srd 6, 6, 3 +; LE-64BIT-NEXT: sld 10, 8, 7 +; LE-64BIT-NEXT: sld 11, 4, 7 +; LE-64BIT-NEXT: srd 8, 8, 3 +; LE-64BIT-NEXT: sld 7, 9, 7 +; LE-64BIT-NEXT: or 6, 10, 6 +; LE-64BIT-NEXT: srd 10, 9, 3 +; LE-64BIT-NEXT: srd 3, 4, 3 +; LE-64BIT-NEXT: or 7, 7, 8 +; LE-64BIT-NEXT: std 3, 24(5) +; LE-64BIT-NEXT: or 3, 11, 10 +; LE-64BIT-NEXT: std 7, 8(5) +; LE-64BIT-NEXT: std 6, 0(5) +; LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: lshr_32bytes: +; BE: # %bb.0: +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: ld 7, 8(3) +; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 
10, 1, -32 +; BE-NEXT: std 9, -40(1) +; BE-NEXT: std 9, -48(1) +; BE-NEXT: std 9, -56(1) +; BE-NEXT: std 9, -64(1) +; BE-NEXT: std 3, -8(1) +; BE-NEXT: rlwinm 3, 4, 0, 27, 28 +; BE-NEXT: neg 3, 3 +; BE-NEXT: std 8, -16(1) +; BE-NEXT: std 7, -24(1) +; BE-NEXT: std 6, -32(1) +; BE-NEXT: extsw 3, 3 +; BE-NEXT: ldux 3, 10, 3 +; BE-NEXT: rlwinm 4, 4, 3, 26, 28 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 6, 8(10) +; BE-NEXT: ld 7, 24(10) +; BE-NEXT: ld 8, 16(10) +; BE-NEXT: sld 10, 3, 9 +; BE-NEXT: srd 3, 3, 4 +; BE-NEXT: std 3, 0(5) +; BE-NEXT: srd 11, 6, 4 +; BE-NEXT: srd 7, 7, 4 +; BE-NEXT: sld 6, 6, 9 +; BE-NEXT: sld 9, 8, 9 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 7, 9, 7 +; BE-NEXT: or 6, 6, 8 +; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 7, 24(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: lshr_32bytes: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -112(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 6, 44(1) +; LE-32BIT-NEXT: stw 6, 40(1) +; LE-32BIT-NEXT: stw 6, 36(1) +; LE-32BIT-NEXT: stw 6, 32(1) +; LE-32BIT-NEXT: stw 6, 28(1) +; LE-32BIT-NEXT: stw 6, 24(1) +; LE-32BIT-NEXT: stw 6, 20(1) +; LE-32BIT-NEXT: stw 6, 16(1) +; LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29 +; LE-32BIT-NEXT: stw 3, 76(1) +; LE-32BIT-NEXT: addi 3, 1, 48 +; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: sub 3, 3, 6 +; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 0, 72(1) +; 
LE-32BIT-NEXT: subfic 0, 4, 32 +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 30, 6, 4 +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: slw 29, 7, 0 +; LE-32BIT-NEXT: lwz 10, 20(3) +; LE-32BIT-NEXT: srw 28, 8, 4 +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: slw 27, 9, 0 +; LE-32BIT-NEXT: lwz 12, 28(3) +; LE-32BIT-NEXT: slw 6, 6, 0 +; LE-32BIT-NEXT: lwz 3, 24(3) +; LE-32BIT-NEXT: srw 26, 10, 4 +; LE-32BIT-NEXT: slw 25, 11, 0 +; LE-32BIT-NEXT: slw 8, 8, 0 +; LE-32BIT-NEXT: slw 10, 10, 0 +; LE-32BIT-NEXT: slw 0, 3, 0 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: srw 12, 12, 4 +; LE-32BIT-NEXT: or 3, 10, 3 +; LE-32BIT-NEXT: srw 11, 11, 4 +; LE-32BIT-NEXT: stw 3, 24(5) +; LE-32BIT-NEXT: or 3, 0, 12 +; LE-32BIT-NEXT: stw 3, 28(5) +; LE-32BIT-NEXT: or 3, 8, 11 +; LE-32BIT-NEXT: srw 9, 9, 4 +; LE-32BIT-NEXT: stw 3, 16(5) +; LE-32BIT-NEXT: or 3, 25, 26 +; LE-32BIT-NEXT: stw 3, 20(5) +; LE-32BIT-NEXT: or 3, 6, 9 +; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 27, 28 +; LE-32BIT-NEXT: srw 4, 7, 4 +; LE-32BIT-NEXT: stw 3, 12(5) +; LE-32BIT-NEXT: or 3, 29, 30 +; LE-32BIT-NEXT: stw 4, 0(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: addi 1, 1, 112 +; LE-32BIT-NEXT: blr + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void 
@lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: lshr_32bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: li 6, 16 +; LE-64BIT-NEXT: lxvd2x 1, 0, 3 +; LE-64BIT-NEXT: xxlxor 2, 2, 2 +; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: li 8, 32 +; LE-64BIT-NEXT: lxvd2x 0, 3, 6 +; LE-64BIT-NEXT: lwz 3, 0(4) +; LE-64BIT-NEXT: li 4, 48 +; LE-64BIT-NEXT: stxvd2x 2, 7, 4 +; LE-64BIT-NEXT: stxvd2x 2, 7, 8 +; LE-64BIT-NEXT: rlwinm 4, 3, 2, 27, 28 +; LE-64BIT-NEXT: rlwinm 3, 3, 5, 26, 26 +; LE-64BIT-NEXT: stxvd2x 0, 7, 6 +; LE-64BIT-NEXT: stxvd2x 1, 0, 7 +; LE-64BIT-NEXT: ldux 6, 4, 7 +; LE-64BIT-NEXT: subfic 7, 3, 64 +; LE-64BIT-NEXT: ld 8, 8(4) +; LE-64BIT-NEXT: ld 9, 16(4) +; LE-64BIT-NEXT: ld 4, 24(4) +; LE-64BIT-NEXT: srd 6, 6, 3 +; LE-64BIT-NEXT: sld 10, 8, 7 +; LE-64BIT-NEXT: sld 11, 4, 7 +; LE-64BIT-NEXT: srd 8, 8, 3 +; LE-64BIT-NEXT: sld 7, 9, 7 +; LE-64BIT-NEXT: or 6, 10, 6 +; LE-64BIT-NEXT: srd 10, 9, 3 +; LE-64BIT-NEXT: srd 3, 4, 3 +; LE-64BIT-NEXT: or 7, 7, 8 +; LE-64BIT-NEXT: std 3, 24(5) +; LE-64BIT-NEXT: or 3, 11, 10 +; LE-64BIT-NEXT: std 7, 8(5) +; LE-64BIT-NEXT: std 6, 0(5) +; LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: lshr_32bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: ld 7, 8(3) +; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 10, 1, -32 +; BE-NEXT: std 9, -40(1) +; BE-NEXT: std 9, -48(1) +; BE-NEXT: std 9, -56(1) +; BE-NEXT: std 9, -64(1) +; BE-NEXT: std 3, -8(1) +; BE-NEXT: rlwinm 3, 4, 2, 27, 28 +; BE-NEXT: neg 3, 3 +; BE-NEXT: std 8, -16(1) +; BE-NEXT: std 7, -24(1) +; BE-NEXT: std 6, -32(1) +; BE-NEXT: extsw 3, 3 +; BE-NEXT: ldux 3, 10, 3 +; BE-NEXT: rlwinm 4, 4, 5, 26, 26 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 6, 8(10) +; BE-NEXT: ld 7, 24(10) +; BE-NEXT: ld 8, 16(10) +; BE-NEXT: sld 10, 3, 9 +; BE-NEXT: srd 3, 3, 4 +; BE-NEXT: std 3, 0(5) +; BE-NEXT: srd 11, 6, 4 +; BE-NEXT: srd 7, 7, 4 +; 
BE-NEXT: sld 6, 6, 9 +; BE-NEXT: sld 9, 8, 9 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 7, 9, 7 +; BE-NEXT: or 6, 6, 8 +; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 7, 24(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: lshr_32bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -80(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 3, 76(1) +; LE-32BIT-NEXT: addi 3, 1, 48 +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29 +; LE-32BIT-NEXT: stw 6, 44(1) +; LE-32BIT-NEXT: sub 3, 3, 4 +; LE-32BIT-NEXT: stw 6, 40(1) +; LE-32BIT-NEXT: stw 6, 36(1) +; LE-32BIT-NEXT: stw 6, 32(1) +; LE-32BIT-NEXT: stw 6, 28(1) +; LE-32BIT-NEXT: stw 6, 24(1) +; LE-32BIT-NEXT: stw 6, 20(1) +; LE-32BIT-NEXT: stw 6, 16(1) +; LE-32BIT-NEXT: stw 0, 72(1) +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: lwz 4, 4(3) +; LE-32BIT-NEXT: lwz 6, 0(3) +; LE-32BIT-NEXT: lwz 7, 12(3) +; LE-32BIT-NEXT: lwz 8, 8(3) +; LE-32BIT-NEXT: lwz 9, 20(3) +; LE-32BIT-NEXT: lwz 10, 16(3) +; LE-32BIT-NEXT: lwz 11, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: stw 11, 24(5) +; LE-32BIT-NEXT: stw 3, 28(5) +; LE-32BIT-NEXT: stw 10, 16(5) +; LE-32BIT-NEXT: stw 9, 20(5) +; LE-32BIT-NEXT: stw 8, 8(5) +; LE-32BIT-NEXT: stw 7, 12(5) +; LE-32BIT-NEXT: stw 6, 0(5) +; LE-32BIT-NEXT: stw 4, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 80 +; LE-32BIT-NEXT: blr + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + 
+define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: lshr_32bytes_dwordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: li 6, 16 +; LE-64BIT-NEXT: lxvd2x 1, 0, 3 +; LE-64BIT-NEXT: xxlxor 2, 2, 2 +; LE-64BIT-NEXT: addi 7, 1, -64 ; LE-64BIT-NEXT: lxvd2x 0, 3, 6 ; LE-64BIT-NEXT: lwz 3, 0(4) ; LE-64BIT-NEXT: li 4, 48 ; LE-64BIT-NEXT: stxvd2x 2, 7, 4 ; LE-64BIT-NEXT: li 4, 32 -; LE-64BIT-NEXT: clrldi 3, 3, 59 +; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28 ; LE-64BIT-NEXT: stxvd2x 2, 7, 4 ; LE-64BIT-NEXT: stxvd2x 0, 7, 6 ; LE-64BIT-NEXT: stxvd2x 1, 0, 7 @@ -438,25 +1018,24 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: stxvd2x 0, 0, 5 ; LE-64BIT-NEXT: blr ; -; BE-LABEL: lshr_32bytes: +; BE-LABEL: lshr_32bytes_dwordOff: ; BE: # %bb.0: -; BE-NEXT: ld 6, 0(3) -; BE-NEXT: ld 7, 8(3) -; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 7, 0(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 9, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 9, 1, -64 -; BE-NEXT: li 10, 0 -; BE-NEXT: std 10, 24(9) -; BE-NEXT: std 10, 16(9) -; BE-NEXT: std 10, 8(9) -; BE-NEXT: std 10, -64(1) -; BE-NEXT: std 3, 56(9) -; BE-NEXT: clrlwi 3, 4, 27 +; BE-NEXT: li 6, 0 +; BE-NEXT: std 6, -40(1) +; BE-NEXT: std 6, -48(1) +; BE-NEXT: std 6, -56(1) +; BE-NEXT: std 6, -64(1) +; BE-NEXT: std 3, -8(1) +; BE-NEXT: rlwinm 3, 4, 3, 27, 28 ; BE-NEXT: neg 3, 3 -; BE-NEXT: std 8, 48(9) -; BE-NEXT: std 7, 40(9) -; BE-NEXT: std 6, 32(9) +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 8, -24(1) +; BE-NEXT: std 7, -32(1) ; BE-NEXT: extsw 3, 3 ; BE-NEXT: addi 4, 1, -32 ; BE-NEXT: ldux 3, 4, 3 @@ -469,7 +1048,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; BE-NEXT: std 6, 8(5) ; BE-NEXT: blr ; -; LE-32BIT-LABEL: lshr_32bytes: +; LE-32BIT-LABEL: lshr_32bytes_dwordOff: ; LE-32BIT: # %bb.0: ; LE-32BIT-NEXT: stwu 1, -80(1) ; LE-32BIT-NEXT: lwz 7, 0(3) @@ -484,7 +1063,7 @@ define void 
@lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 4, 28(4) ; LE-32BIT-NEXT: stw 3, 76(1) ; LE-32BIT-NEXT: addi 3, 1, 48 -; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 ; LE-32BIT-NEXT: stw 6, 44(1) ; LE-32BIT-NEXT: sub 3, 3, 4 ; LE-32BIT-NEXT: stw 6, 40(1) @@ -520,16 +1099,329 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: addi 1, 1, 80 ; LE-32BIT-NEXT: blr %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = lshr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-LABEL: shl_32bytes: ; LE-64BIT: # %bb.0: ; LE-64BIT-NEXT: li 6, 16 +; LE-64BIT-NEXT: lwz 4, 0(4) +; LE-64BIT-NEXT: xxlxor 2, 2, 2 +; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: lxvd2x 1, 0, 3 +; LE-64BIT-NEXT: addi 8, 1, -32 +; LE-64BIT-NEXT: lxvd2x 0, 3, 6 +; LE-64BIT-NEXT: stxvd2x 2, 7, 6 +; LE-64BIT-NEXT: li 6, 48 +; LE-64BIT-NEXT: rlwinm 3, 4, 0, 27, 28 +; LE-64BIT-NEXT: rlwinm 4, 4, 3, 26, 28 +; LE-64BIT-NEXT: neg 3, 3 +; LE-64BIT-NEXT: stxvd2x 0, 7, 6 +; LE-64BIT-NEXT: li 6, 32 +; LE-64BIT-NEXT: extsw 3, 3 +; LE-64BIT-NEXT: stxvd2x 1, 7, 6 +; LE-64BIT-NEXT: stxvd2x 2, 0, 7 +; LE-64BIT-NEXT: subfic 6, 4, 64 +; LE-64BIT-NEXT: ldux 3, 8, 3 +; LE-64BIT-NEXT: ld 7, 16(8) +; LE-64BIT-NEXT: ld 9, 24(8) +; LE-64BIT-NEXT: ld 8, 8(8) +; LE-64BIT-NEXT: srd 10, 7, 6 +; LE-64BIT-NEXT: sld 9, 9, 4 +; LE-64BIT-NEXT: sld 7, 7, 4 +; LE-64BIT-NEXT: or 9, 9, 10 +; LE-64BIT-NEXT: srd 10, 8, 6 +; LE-64BIT-NEXT: srd 6, 3, 6 +; LE-64BIT-NEXT: sld 8, 8, 4 +; LE-64BIT-NEXT: sld 3, 3, 4 +; LE-64BIT-NEXT: or 6, 8, 6 +; LE-64BIT-NEXT: std 3, 0(5) +; LE-64BIT-NEXT: or 3, 7, 10 +; LE-64BIT-NEXT: std 9, 24(5) +; LE-64BIT-NEXT: std 6, 8(5) +; 
LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: shl_32bytes: +; BE: # %bb.0: +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: ld 7, 8(3) +; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 10, 1, -64 +; BE-NEXT: std 9, -8(1) +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 9, -24(1) +; BE-NEXT: std 9, -32(1) +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 8, -48(1) +; BE-NEXT: std 7, -56(1) +; BE-NEXT: std 6, -64(1) +; BE-NEXT: rlwinm 3, 4, 0, 27, 28 +; BE-NEXT: ldux 6, 3, 10 +; BE-NEXT: rlwinm 4, 4, 3, 26, 28 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 16(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: sld 6, 6, 4 +; BE-NEXT: srd 10, 7, 9 +; BE-NEXT: sld 11, 8, 4 +; BE-NEXT: srd 8, 8, 9 +; BE-NEXT: srd 9, 3, 9 +; BE-NEXT: sld 7, 7, 4 +; BE-NEXT: sld 3, 3, 4 +; BE-NEXT: or 10, 11, 10 +; BE-NEXT: or 6, 6, 8 +; BE-NEXT: or 7, 7, 9 +; BE-NEXT: std 3, 24(5) +; BE-NEXT: std 7, 16(5) +; BE-NEXT: std 6, 0(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: shl_32bytes: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -112(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 6, 76(1) +; LE-32BIT-NEXT: stw 6, 72(1) +; LE-32BIT-NEXT: stw 6, 68(1) +; LE-32BIT-NEXT: stw 6, 64(1) +; LE-32BIT-NEXT: stw 6, 60(1) +; LE-32BIT-NEXT: stw 6, 56(1) +; LE-32BIT-NEXT: stw 6, 52(1) +; LE-32BIT-NEXT: stw 6, 48(1) +; 
LE-32BIT-NEXT: rlwinm 6, 4, 0, 27, 29 +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: addi 3, 1, 16 +; LE-32BIT-NEXT: stw 0, 40(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 12, 36(1) +; LE-32BIT-NEXT: subfic 12, 4, 32 +; LE-32BIT-NEXT: stw 11, 32(1) +; LE-32BIT-NEXT: stw 10, 28(1) +; LE-32BIT-NEXT: stw 9, 24(1) +; LE-32BIT-NEXT: stw 8, 20(1) +; LE-32BIT-NEXT: stw 7, 16(1) +; LE-32BIT-NEXT: lwzux 3, 6, 3 +; LE-32BIT-NEXT: lwz 7, 8(6) +; LE-32BIT-NEXT: slw 3, 3, 4 +; LE-32BIT-NEXT: lwz 8, 4(6) +; LE-32BIT-NEXT: lwz 9, 16(6) +; LE-32BIT-NEXT: srw 30, 7, 12 +; LE-32BIT-NEXT: lwz 10, 12(6) +; LE-32BIT-NEXT: slw 29, 8, 4 +; LE-32BIT-NEXT: lwz 11, 24(6) +; LE-32BIT-NEXT: srw 8, 8, 12 +; LE-32BIT-NEXT: lwz 0, 20(6) +; LE-32BIT-NEXT: srw 28, 9, 12 +; LE-32BIT-NEXT: lwz 6, 28(6) +; LE-32BIT-NEXT: slw 27, 10, 4 +; LE-32BIT-NEXT: srw 10, 10, 12 +; LE-32BIT-NEXT: slw 7, 7, 4 +; LE-32BIT-NEXT: srw 26, 11, 12 +; LE-32BIT-NEXT: slw 25, 0, 4 +; LE-32BIT-NEXT: srw 0, 0, 12 +; LE-32BIT-NEXT: slw 9, 9, 4 +; LE-32BIT-NEXT: srw 12, 6, 12 +; LE-32BIT-NEXT: slw 11, 11, 4 +; LE-32BIT-NEXT: slw 4, 6, 4 +; LE-32BIT-NEXT: stw 4, 28(5) +; LE-32BIT-NEXT: or 4, 11, 12 +; LE-32BIT-NEXT: stw 4, 24(5) +; LE-32BIT-NEXT: or 4, 9, 0 +; LE-32BIT-NEXT: stw 4, 16(5) +; LE-32BIT-NEXT: or 4, 25, 26 +; LE-32BIT-NEXT: stw 4, 20(5) +; LE-32BIT-NEXT: or 4, 7, 10 +; LE-32BIT-NEXT: or 3, 3, 8 +; LE-32BIT-NEXT: stw 4, 8(5) +; LE-32BIT-NEXT: or 4, 27, 28 +; LE-32BIT-NEXT: stw 3, 0(5) +; LE-32BIT-NEXT: or 3, 29, 30 +; LE-32BIT-NEXT: stw 4, 12(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: addi 1, 1, 112 +; LE-32BIT-NEXT: blr + %src = load i256, ptr %src.ptr, align 
1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: shl_32bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: li 6, 16 +; LE-64BIT-NEXT: lwz 4, 0(4) +; LE-64BIT-NEXT: xxlxor 2, 2, 2 +; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: lxvd2x 1, 0, 3 +; LE-64BIT-NEXT: addi 8, 1, -32 +; LE-64BIT-NEXT: lxvd2x 0, 3, 6 +; LE-64BIT-NEXT: stxvd2x 2, 7, 6 +; LE-64BIT-NEXT: li 6, 48 +; LE-64BIT-NEXT: rlwinm 3, 4, 2, 27, 28 +; LE-64BIT-NEXT: rlwinm 4, 4, 5, 26, 26 +; LE-64BIT-NEXT: neg 3, 3 +; LE-64BIT-NEXT: stxvd2x 0, 7, 6 +; LE-64BIT-NEXT: li 6, 32 +; LE-64BIT-NEXT: extsw 3, 3 +; LE-64BIT-NEXT: stxvd2x 1, 7, 6 +; LE-64BIT-NEXT: stxvd2x 2, 0, 7 +; LE-64BIT-NEXT: subfic 6, 4, 64 +; LE-64BIT-NEXT: ldux 3, 8, 3 +; LE-64BIT-NEXT: ld 7, 16(8) +; LE-64BIT-NEXT: ld 9, 24(8) +; LE-64BIT-NEXT: ld 8, 8(8) +; LE-64BIT-NEXT: srd 10, 7, 6 +; LE-64BIT-NEXT: sld 9, 9, 4 +; LE-64BIT-NEXT: sld 7, 7, 4 +; LE-64BIT-NEXT: or 9, 9, 10 +; LE-64BIT-NEXT: srd 10, 8, 6 +; LE-64BIT-NEXT: srd 6, 3, 6 +; LE-64BIT-NEXT: sld 8, 8, 4 +; LE-64BIT-NEXT: sld 3, 3, 4 +; LE-64BIT-NEXT: or 6, 8, 6 +; LE-64BIT-NEXT: std 3, 0(5) +; LE-64BIT-NEXT: or 3, 7, 10 +; LE-64BIT-NEXT: std 9, 24(5) +; LE-64BIT-NEXT: std 6, 8(5) +; LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: shl_32bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: ld 6, 0(3) +; BE-NEXT: ld 7, 8(3) +; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 10, 1, -64 +; BE-NEXT: std 9, -8(1) +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 9, -24(1) +; BE-NEXT: std 9, -32(1) +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 8, -48(1) +; BE-NEXT: std 7, -56(1) +; BE-NEXT: std 6, -64(1) +; BE-NEXT: rlwinm 3, 4, 2, 27, 28 +; BE-NEXT: ldux 6, 3, 10 +; BE-NEXT: rlwinm 4, 4, 5, 26, 26 +; 
BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 16(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: sld 6, 6, 4 +; BE-NEXT: srd 10, 7, 9 +; BE-NEXT: sld 11, 8, 4 +; BE-NEXT: srd 8, 8, 9 +; BE-NEXT: srd 9, 3, 9 +; BE-NEXT: sld 7, 7, 4 +; BE-NEXT: sld 3, 3, 4 +; BE-NEXT: or 10, 11, 10 +; BE-NEXT: or 6, 6, 8 +; BE-NEXT: or 7, 7, 9 +; BE-NEXT: std 3, 24(5) +; BE-NEXT: std 7, 16(5) +; BE-NEXT: std 6, 0(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: shl_32bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -80(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: li 6, 0 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 6, 76(1) +; LE-32BIT-NEXT: stw 6, 72(1) +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29 +; LE-32BIT-NEXT: stw 6, 68(1) +; LE-32BIT-NEXT: stw 6, 64(1) +; LE-32BIT-NEXT: stw 6, 60(1) +; LE-32BIT-NEXT: stw 6, 56(1) +; LE-32BIT-NEXT: stw 6, 52(1) +; LE-32BIT-NEXT: stw 6, 48(1) +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: addi 3, 1, 16 +; LE-32BIT-NEXT: stw 0, 40(1) +; LE-32BIT-NEXT: stw 12, 36(1) +; LE-32BIT-NEXT: stw 11, 32(1) +; LE-32BIT-NEXT: stw 10, 28(1) +; LE-32BIT-NEXT: stw 9, 24(1) +; LE-32BIT-NEXT: stw 8, 20(1) +; LE-32BIT-NEXT: stw 7, 16(1) +; LE-32BIT-NEXT: lwzux 3, 4, 3 +; LE-32BIT-NEXT: lwz 6, 4(4) +; LE-32BIT-NEXT: lwz 7, 12(4) +; LE-32BIT-NEXT: lwz 8, 8(4) +; LE-32BIT-NEXT: lwz 9, 20(4) +; LE-32BIT-NEXT: lwz 10, 16(4) +; LE-32BIT-NEXT: lwz 11, 28(4) +; LE-32BIT-NEXT: lwz 4, 24(4) +; LE-32BIT-NEXT: stw 3, 0(5) +; LE-32BIT-NEXT: stw 4, 24(5) +; LE-32BIT-NEXT: stw 11, 28(5) +; LE-32BIT-NEXT: stw 10, 16(5) +; LE-32BIT-NEXT: stw 9, 20(5) +; LE-32BIT-NEXT: stw 8, 8(5) +; LE-32BIT-NEXT: stw 7, 12(5) +; LE-32BIT-NEXT: stw 6, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 80 +; LE-32BIT-NEXT: blr + %src = load i256, 
ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: shl_32bytes_dwordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: li 6, 16 ; LE-64BIT-NEXT: lxvd2x 1, 0, 3 ; LE-64BIT-NEXT: xxlxor 2, 2, 2 ; LE-64BIT-NEXT: li 7, 48 @@ -537,7 +1429,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: lwz 3, 0(4) ; LE-64BIT-NEXT: addi 4, 1, -64 ; LE-64BIT-NEXT: stxvd2x 2, 4, 6 -; LE-64BIT-NEXT: clrlwi 3, 3, 27 +; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28 ; LE-64BIT-NEXT: stxvd2x 0, 4, 7 ; LE-64BIT-NEXT: li 7, 32 ; LE-64BIT-NEXT: neg 3, 3 @@ -552,25 +1444,25 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: stxvd2x 0, 0, 5 ; LE-64BIT-NEXT: blr ; -; BE-LABEL: shl_32bytes: +; BE-LABEL: shl_32bytes_dwordOff: ; BE: # %bb.0: -; BE-NEXT: ld 6, 0(3) -; BE-NEXT: ld 7, 8(3) -; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 7, 0(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 9, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 9, 1, -64 -; BE-NEXT: li 10, 0 -; BE-NEXT: std 10, 56(9) -; BE-NEXT: std 10, 48(9) -; BE-NEXT: std 10, 40(9) -; BE-NEXT: std 10, 32(9) -; BE-NEXT: std 3, 24(9) -; BE-NEXT: std 8, 16(9) -; BE-NEXT: std 7, 8(9) -; BE-NEXT: std 6, -64(1) -; BE-NEXT: clrldi 3, 4, 59 -; BE-NEXT: ldux 4, 3, 9 +; BE-NEXT: li 6, 0 +; BE-NEXT: std 6, -8(1) +; BE-NEXT: std 6, -16(1) +; BE-NEXT: std 6, -24(1) +; BE-NEXT: std 6, -32(1) +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 9, -48(1) +; BE-NEXT: std 8, -56(1) +; BE-NEXT: std 7, -64(1) +; BE-NEXT: rlwinm 3, 4, 3, 27, 28 +; BE-NEXT: addi 4, 1, -64 +; BE-NEXT: ldux 4, 3, 4 ; BE-NEXT: ld 6, 8(3) ; BE-NEXT: ld 7, 24(3) ; BE-NEXT: ld 3, 16(3) @@ -580,7 +1472,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, 
ptr %dst) nounwind { ; BE-NEXT: std 6, 8(5) ; BE-NEXT: blr ; -; LE-32BIT-LABEL: shl_32bytes: +; LE-32BIT-LABEL: shl_32bytes_dwordOff: ; LE-32BIT: # %bb.0: ; LE-32BIT-NEXT: stwu 1, -80(1) ; LE-32BIT-NEXT: lwz 7, 0(3) @@ -595,7 +1487,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 4, 28(4) ; LE-32BIT-NEXT: stw 6, 76(1) ; LE-32BIT-NEXT: stw 6, 72(1) -; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 ; LE-32BIT-NEXT: stw 6, 68(1) ; LE-32BIT-NEXT: stw 6, 64(1) ; LE-32BIT-NEXT: stw 6, 60(1) @@ -612,87 +1504,403 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stw 8, 20(1) ; LE-32BIT-NEXT: stw 7, 16(1) ; LE-32BIT-NEXT: lwzux 3, 4, 3 -; LE-32BIT-NEXT: lwz 6, 4(4) -; LE-32BIT-NEXT: lwz 7, 12(4) -; LE-32BIT-NEXT: lwz 8, 8(4) -; LE-32BIT-NEXT: lwz 9, 20(4) -; LE-32BIT-NEXT: lwz 10, 16(4) -; LE-32BIT-NEXT: lwz 11, 28(4) -; LE-32BIT-NEXT: lwz 4, 24(4) +; LE-32BIT-NEXT: lwz 6, 12(4) +; LE-32BIT-NEXT: lwz 7, 8(4) +; LE-32BIT-NEXT: lwz 8, 20(4) +; LE-32BIT-NEXT: lwz 9, 16(4) +; LE-32BIT-NEXT: lwz 10, 28(4) +; LE-32BIT-NEXT: lwz 11, 24(4) +; LE-32BIT-NEXT: ori 4, 4, 4 +; LE-32BIT-NEXT: lwz 4, 0(4) ; LE-32BIT-NEXT: stw 3, 0(5) -; LE-32BIT-NEXT: stw 4, 24(5) -; LE-32BIT-NEXT: stw 11, 28(5) -; LE-32BIT-NEXT: stw 10, 16(5) -; LE-32BIT-NEXT: stw 9, 20(5) -; LE-32BIT-NEXT: stw 8, 8(5) -; LE-32BIT-NEXT: stw 7, 12(5) -; LE-32BIT-NEXT: stw 6, 4(5) +; LE-32BIT-NEXT: stw 11, 24(5) +; LE-32BIT-NEXT: stw 10, 28(5) +; LE-32BIT-NEXT: stw 9, 16(5) +; LE-32BIT-NEXT: stw 8, 20(5) +; LE-32BIT-NEXT: stw 7, 8(5) +; LE-32BIT-NEXT: stw 6, 12(5) +; LE-32BIT-NEXT: stw 4, 4(5) ; LE-32BIT-NEXT: addi 1, 1, 80 ; LE-32BIT-NEXT: blr %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = shl i256 %src, %bitOff store i256 %res, 
ptr %dst, align 1 ret void } + + define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-64BIT-LABEL: ashr_32bytes: ; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: ld 6, 24(3) ; LE-64BIT-NEXT: lxvd2x 0, 0, 3 -; LE-64BIT-NEXT: ld 6, 16(3) -; LE-64BIT-NEXT: ld 3, 24(3) +; LE-64BIT-NEXT: lwz 4, 0(4) ; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: ld 3, 16(3) +; LE-64BIT-NEXT: sradi 8, 6, 63 +; LE-64BIT-NEXT: rlwinm 9, 4, 0, 27, 28 +; LE-64BIT-NEXT: stxvd2x 0, 0, 7 +; LE-64BIT-NEXT: std 6, -40(1) +; LE-64BIT-NEXT: std 3, -48(1) +; LE-64BIT-NEXT: std 8, -8(1) +; LE-64BIT-NEXT: std 8, -16(1) +; LE-64BIT-NEXT: std 8, -24(1) +; LE-64BIT-NEXT: std 8, -32(1) +; LE-64BIT-NEXT: rlwinm 3, 4, 3, 26, 28 +; LE-64BIT-NEXT: ldux 4, 9, 7 +; LE-64BIT-NEXT: ld 7, 8(9) +; LE-64BIT-NEXT: subfic 6, 3, 64 +; LE-64BIT-NEXT: ld 8, 16(9) +; LE-64BIT-NEXT: ld 9, 24(9) +; LE-64BIT-NEXT: srd 4, 4, 3 +; LE-64BIT-NEXT: sld 10, 7, 6 +; LE-64BIT-NEXT: sld 11, 9, 6 +; LE-64BIT-NEXT: srd 7, 7, 3 +; LE-64BIT-NEXT: sld 6, 8, 6 +; LE-64BIT-NEXT: or 4, 10, 4 +; LE-64BIT-NEXT: srd 10, 8, 3 +; LE-64BIT-NEXT: srad 3, 9, 3 +; LE-64BIT-NEXT: or 6, 6, 7 +; LE-64BIT-NEXT: std 3, 24(5) +; LE-64BIT-NEXT: or 3, 11, 10 +; LE-64BIT-NEXT: std 6, 8(5) +; LE-64BIT-NEXT: std 4, 0(5) +; LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: ashr_32bytes: +; BE: # %bb.0: +; BE-NEXT: ld 7, 0(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 9, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: addi 6, 1, -32 +; BE-NEXT: std 3, -8(1) +; BE-NEXT: std 7, -32(1) +; BE-NEXT: sradi 3, 7, 63 +; BE-NEXT: rlwinm 7, 4, 0, 27, 28 +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 3, -48(1) +; BE-NEXT: std 3, -56(1) +; BE-NEXT: std 3, -64(1) +; BE-NEXT: neg 3, 7 +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 8, -24(1) +; BE-NEXT: extsw 3, 3 +; BE-NEXT: ldux 3, 6, 3 +; BE-NEXT: rlwinm 4, 4, 3, 26, 28 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 8(6) +; BE-NEXT: ld 8, 24(6) +; BE-NEXT: ld 6, 16(6) +; BE-NEXT: 
sld 10, 3, 9 +; BE-NEXT: srad 3, 3, 4 +; BE-NEXT: std 3, 0(5) +; BE-NEXT: srd 11, 7, 4 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: sld 7, 7, 9 +; BE-NEXT: sld 9, 6, 9 +; BE-NEXT: srd 6, 6, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 8, 9, 8 +; BE-NEXT: or 6, 7, 6 +; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 8, 24(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: ashr_32bytes: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -112(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: addi 6, 1, 48 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 3, 76(1) +; LE-32BIT-NEXT: srawi 3, 7, 31 +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: rlwinm 7, 4, 0, 27, 29 +; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 +; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: stw 0, 72(1) +; LE-32BIT-NEXT: subfic 0, 4, 32 +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: stw 3, 40(1) +; LE-32BIT-NEXT: stw 3, 36(1) +; LE-32BIT-NEXT: stw 3, 32(1) +; LE-32BIT-NEXT: stw 3, 28(1) +; LE-32BIT-NEXT: stw 3, 24(1) +; LE-32BIT-NEXT: stw 3, 20(1) +; LE-32BIT-NEXT: stw 3, 16(1) +; LE-32BIT-NEXT: sub 3, 6, 7 +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 30, 6, 4 +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: slw 29, 7, 0 +; LE-32BIT-NEXT: lwz 10, 20(3) +; LE-32BIT-NEXT: srw 28, 8, 4 
+; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: slw 27, 9, 0 +; LE-32BIT-NEXT: lwz 12, 28(3) +; LE-32BIT-NEXT: slw 6, 6, 0 +; LE-32BIT-NEXT: lwz 3, 24(3) +; LE-32BIT-NEXT: srw 26, 10, 4 +; LE-32BIT-NEXT: slw 25, 11, 0 +; LE-32BIT-NEXT: slw 8, 8, 0 +; LE-32BIT-NEXT: slw 10, 10, 0 +; LE-32BIT-NEXT: slw 0, 3, 0 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: srw 12, 12, 4 +; LE-32BIT-NEXT: or 3, 10, 3 +; LE-32BIT-NEXT: srw 11, 11, 4 +; LE-32BIT-NEXT: stw 3, 24(5) +; LE-32BIT-NEXT: or 3, 0, 12 +; LE-32BIT-NEXT: stw 3, 28(5) +; LE-32BIT-NEXT: or 3, 8, 11 +; LE-32BIT-NEXT: srw 9, 9, 4 +; LE-32BIT-NEXT: stw 3, 16(5) +; LE-32BIT-NEXT: or 3, 25, 26 +; LE-32BIT-NEXT: stw 3, 20(5) +; LE-32BIT-NEXT: or 3, 6, 9 +; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 27, 28 +; LE-32BIT-NEXT: sraw 4, 7, 4 +; LE-32BIT-NEXT: stw 3, 12(5) +; LE-32BIT-NEXT: or 3, 29, 30 +; LE-32BIT-NEXT: stw 4, 0(5) +; LE-32BIT-NEXT: stw 3, 4(5) +; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 28, 96(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 27, 92(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 26, 88(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: lwz 25, 84(1) # 4-byte Folded Reload +; LE-32BIT-NEXT: addi 1, 1, 112 +; LE-32BIT-NEXT: blr + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: ashr_32bytes_wordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: ld 6, 24(3) +; LE-64BIT-NEXT: lxvd2x 0, 0, 3 ; LE-64BIT-NEXT: lwz 4, 0(4) -; LE-64BIT-NEXT: li 8, 16 -; LE-64BIT-NEXT: std 3, 24(7) -; LE-64BIT-NEXT: sradi 3, 3, 63 -; LE-64BIT-NEXT: std 6, 16(7) -; LE-64BIT-NEXT: std 3, 56(7) -; LE-64BIT-NEXT: std 3, 48(7) -; LE-64BIT-NEXT: std 3, 40(7) -; LE-64BIT-NEXT: 
std 3, 32(7) -; LE-64BIT-NEXT: clrldi 3, 4, 59 +; LE-64BIT-NEXT: addi 7, 1, -64 +; LE-64BIT-NEXT: ld 3, 16(3) +; LE-64BIT-NEXT: sradi 8, 6, 63 +; LE-64BIT-NEXT: rlwinm 9, 4, 2, 27, 28 ; LE-64BIT-NEXT: stxvd2x 0, 0, 7 -; LE-64BIT-NEXT: lxvd2x 0, 7, 3 -; LE-64BIT-NEXT: add 3, 7, 3 -; LE-64BIT-NEXT: lxvd2x 1, 3, 8 -; LE-64BIT-NEXT: stxvd2x 1, 5, 8 +; LE-64BIT-NEXT: std 6, -40(1) +; LE-64BIT-NEXT: std 3, -48(1) +; LE-64BIT-NEXT: std 8, -8(1) +; LE-64BIT-NEXT: std 8, -16(1) +; LE-64BIT-NEXT: std 8, -24(1) +; LE-64BIT-NEXT: std 8, -32(1) +; LE-64BIT-NEXT: rlwinm 3, 4, 5, 26, 26 +; LE-64BIT-NEXT: ldux 4, 9, 7 +; LE-64BIT-NEXT: ld 7, 8(9) +; LE-64BIT-NEXT: subfic 6, 3, 64 +; LE-64BIT-NEXT: ld 8, 16(9) +; LE-64BIT-NEXT: ld 9, 24(9) +; LE-64BIT-NEXT: srd 4, 4, 3 +; LE-64BIT-NEXT: sld 10, 7, 6 +; LE-64BIT-NEXT: sld 11, 9, 6 +; LE-64BIT-NEXT: srd 7, 7, 3 +; LE-64BIT-NEXT: sld 6, 8, 6 +; LE-64BIT-NEXT: or 4, 10, 4 +; LE-64BIT-NEXT: srd 10, 8, 3 +; LE-64BIT-NEXT: srad 3, 9, 3 +; LE-64BIT-NEXT: or 6, 6, 7 +; LE-64BIT-NEXT: std 3, 24(5) +; LE-64BIT-NEXT: or 3, 11, 10 +; LE-64BIT-NEXT: std 6, 8(5) +; LE-64BIT-NEXT: std 4, 0(5) +; LE-64BIT-NEXT: std 3, 16(5) +; LE-64BIT-NEXT: blr +; +; BE-LABEL: ashr_32bytes_wordOff: +; BE: # %bb.0: +; BE-NEXT: ld 7, 0(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 9, 16(3) +; BE-NEXT: ld 3, 24(3) +; BE-NEXT: lwz 4, 28(4) +; BE-NEXT: addi 6, 1, -32 +; BE-NEXT: std 3, -8(1) +; BE-NEXT: std 7, -32(1) +; BE-NEXT: sradi 3, 7, 63 +; BE-NEXT: rlwinm 7, 4, 2, 27, 28 +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 3, -48(1) +; BE-NEXT: std 3, -56(1) +; BE-NEXT: std 3, -64(1) +; BE-NEXT: neg 3, 7 +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 8, -24(1) +; BE-NEXT: extsw 3, 3 +; BE-NEXT: ldux 3, 6, 3 +; BE-NEXT: rlwinm 4, 4, 5, 26, 26 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 8(6) +; BE-NEXT: ld 8, 24(6) +; BE-NEXT: ld 6, 16(6) +; BE-NEXT: sld 10, 3, 9 +; BE-NEXT: srad 3, 3, 4 +; BE-NEXT: std 3, 0(5) +; BE-NEXT: srd 11, 7, 4 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: sld 7, 7, 9 
+; BE-NEXT: sld 9, 6, 9 +; BE-NEXT: srd 6, 6, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 8, 9, 8 +; BE-NEXT: or 6, 7, 6 +; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 8, 24(5) +; BE-NEXT: std 10, 8(5) +; BE-NEXT: blr +; +; LE-32BIT-LABEL: ashr_32bytes_wordOff: +; LE-32BIT: # %bb.0: +; LE-32BIT-NEXT: stwu 1, -80(1) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: addi 6, 1, 48 +; LE-32BIT-NEXT: lwz 8, 4(3) +; LE-32BIT-NEXT: lwz 9, 8(3) +; LE-32BIT-NEXT: lwz 10, 12(3) +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: lwz 12, 20(3) +; LE-32BIT-NEXT: lwz 0, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: lwz 4, 28(4) +; LE-32BIT-NEXT: stw 3, 76(1) +; LE-32BIT-NEXT: srawi 3, 7, 31 +; LE-32BIT-NEXT: rlwinm 4, 4, 2, 27, 29 +; LE-32BIT-NEXT: stw 0, 72(1) +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: stw 3, 40(1) +; LE-32BIT-NEXT: stw 3, 36(1) +; LE-32BIT-NEXT: stw 3, 32(1) +; LE-32BIT-NEXT: stw 3, 28(1) +; LE-32BIT-NEXT: stw 3, 24(1) +; LE-32BIT-NEXT: stw 3, 20(1) +; LE-32BIT-NEXT: stw 3, 16(1) +; LE-32BIT-NEXT: sub 3, 6, 4 +; LE-32BIT-NEXT: lwz 4, 4(3) +; LE-32BIT-NEXT: lwz 6, 0(3) +; LE-32BIT-NEXT: lwz 7, 12(3) +; LE-32BIT-NEXT: lwz 8, 8(3) +; LE-32BIT-NEXT: lwz 9, 20(3) +; LE-32BIT-NEXT: lwz 10, 16(3) +; LE-32BIT-NEXT: lwz 11, 24(3) +; LE-32BIT-NEXT: lwz 3, 28(3) +; LE-32BIT-NEXT: stw 11, 24(5) +; LE-32BIT-NEXT: stw 3, 28(5) +; LE-32BIT-NEXT: stw 10, 16(5) +; LE-32BIT-NEXT: stw 9, 20(5) +; LE-32BIT-NEXT: stw 8, 8(5) +; LE-32BIT-NEXT: stw 7, 12(5) +; LE-32BIT-NEXT: stw 6, 0(5) +; LE-32BIT-NEXT: stw 4, 4(5) +; LE-32BIT-NEXT: addi 1, 1, 80 +; LE-32BIT-NEXT: blr + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void 
@ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; LE-64BIT-LABEL: ashr_32bytes_dwordOff: +; LE-64BIT: # %bb.0: +; LE-64BIT-NEXT: lxvd2x 0, 0, 3 +; LE-64BIT-NEXT: ld 6, 16(3) +; LE-64BIT-NEXT: ld 7, 24(3) +; LE-64BIT-NEXT: lwz 3, 0(4) +; LE-64BIT-NEXT: addi 4, 1, -64 +; LE-64BIT-NEXT: rlwinm 3, 3, 3, 27, 28 +; LE-64BIT-NEXT: stxvd2x 0, 0, 4 +; LE-64BIT-NEXT: std 6, -48(1) +; LE-64BIT-NEXT: sradi 6, 7, 63 +; LE-64BIT-NEXT: std 7, -40(1) +; LE-64BIT-NEXT: std 6, -8(1) +; LE-64BIT-NEXT: std 6, -16(1) +; LE-64BIT-NEXT: std 6, -24(1) +; LE-64BIT-NEXT: std 6, -32(1) +; LE-64BIT-NEXT: lxvd2x 0, 4, 3 +; LE-64BIT-NEXT: add 3, 4, 3 +; LE-64BIT-NEXT: li 4, 16 +; LE-64BIT-NEXT: lxvd2x 1, 3, 4 +; LE-64BIT-NEXT: stxvd2x 1, 5, 4 ; LE-64BIT-NEXT: stxvd2x 0, 0, 5 ; LE-64BIT-NEXT: blr ; -; BE-LABEL: ashr_32bytes: +; BE-LABEL: ashr_32bytes_dwordOff: ; BE: # %bb.0: ; BE-NEXT: ld 7, 0(3) ; BE-NEXT: ld 8, 8(3) ; BE-NEXT: ld 9, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 6, 1, -64 -; BE-NEXT: std 3, 56(6) +; BE-NEXT: addi 6, 1, -32 +; BE-NEXT: std 3, -8(1) ; BE-NEXT: sradi 3, 7, 63 -; BE-NEXT: clrlwi 4, 4, 27 -; BE-NEXT: std 3, 24(6) -; BE-NEXT: std 3, 16(6) -; BE-NEXT: std 3, 8(6) +; BE-NEXT: rlwinm 4, 4, 3, 27, 28 +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 3, -48(1) +; BE-NEXT: std 3, -56(1) ; BE-NEXT: std 3, -64(1) ; BE-NEXT: neg 3, 4 -; BE-NEXT: std 9, 48(6) -; BE-NEXT: std 8, 40(6) -; BE-NEXT: std 7, 32(6) +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 8, -24(1) +; BE-NEXT: std 7, -32(1) ; BE-NEXT: extsw 3, 3 -; BE-NEXT: addi 4, 1, -32 -; BE-NEXT: ldux 3, 4, 3 -; BE-NEXT: ld 6, 8(4) -; BE-NEXT: ld 7, 24(4) -; BE-NEXT: ld 4, 16(4) +; BE-NEXT: ldux 3, 6, 3 +; BE-NEXT: ld 4, 8(6) +; BE-NEXT: ld 7, 24(6) +; BE-NEXT: ld 6, 16(6) ; BE-NEXT: std 3, 0(5) -; BE-NEXT: std 4, 16(5) +; BE-NEXT: std 6, 16(5) ; BE-NEXT: std 7, 24(5) -; BE-NEXT: std 6, 8(5) +; BE-NEXT: std 4, 8(5) ; BE-NEXT: blr ; -; LE-32BIT-LABEL: ashr_32bytes: +; LE-32BIT-LABEL: 
ashr_32bytes_dwordOff: ; LE-32BIT: # %bb.0: ; LE-32BIT-NEXT: stwu 1, -80(1) ; LE-32BIT-NEXT: lwz 7, 0(3) @@ -707,7 +1915,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 4, 28(4) ; LE-32BIT-NEXT: stw 3, 76(1) ; LE-32BIT-NEXT: srawi 3, 7, 31 -; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: rlwinm 4, 4, 3, 27, 28 ; LE-32BIT-NEXT: stw 0, 72(1) ; LE-32BIT-NEXT: stw 12, 68(1) ; LE-32BIT-NEXT: stw 11, 64(1) @@ -743,11 +1951,13 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: addi 1, 1, 80 ; LE-32BIT-NEXT: blr %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; LE: {{.*}} diff --git a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll index 044ddf562294..8e69547df6fc 100644 --- a/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll @@ -209,45 +209,41 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stwu 1, -48(1) ; LE-32BIT-NEXT: lwz 7, 0(3) ; LE-32BIT-NEXT: li 6, 0 -; LE-32BIT-NEXT: lwz 4, 12(4) ; LE-32BIT-NEXT: lwz 8, 4(3) ; LE-32BIT-NEXT: lwz 9, 8(3) ; LE-32BIT-NEXT: lwz 3, 12(3) +; LE-32BIT-NEXT: lwz 4, 12(4) ; LE-32BIT-NEXT: stw 6, 28(1) ; LE-32BIT-NEXT: stw 6, 24(1) ; LE-32BIT-NEXT: stw 6, 20(1) ; LE-32BIT-NEXT: stw 6, 16(1) -; LE-32BIT-NEXT: addi 6, 1, 32 -; LE-32BIT-NEXT: stw 7, 32(1) -; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 31 +; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29 ; LE-32BIT-NEXT: stw 3, 44(1) -; LE-32BIT-NEXT: sub 6, 6, 7 +; 
LE-32BIT-NEXT: addi 3, 1, 32 ; LE-32BIT-NEXT: stw 9, 40(1) -; LE-32BIT-NEXT: li 3, 7 +; LE-32BIT-NEXT: sub 3, 3, 6 ; LE-32BIT-NEXT: stw 8, 36(1) -; LE-32BIT-NEXT: nand 3, 4, 3 -; LE-32BIT-NEXT: lwz 7, 4(6) -; LE-32BIT-NEXT: clrlwi 4, 4, 29 -; LE-32BIT-NEXT: lwz 8, 8(6) -; LE-32BIT-NEXT: subfic 10, 4, 32 -; LE-32BIT-NEXT: lwz 9, 0(6) -; LE-32BIT-NEXT: clrlwi 3, 3, 27 -; LE-32BIT-NEXT: lwz 6, 12(6) -; LE-32BIT-NEXT: srw 11, 8, 4 -; LE-32BIT-NEXT: slw 8, 8, 10 -; LE-32BIT-NEXT: slw 10, 9, 10 -; LE-32BIT-NEXT: srw 6, 6, 4 -; LE-32BIT-NEXT: srw 9, 9, 4 -; LE-32BIT-NEXT: srw 4, 7, 4 -; LE-32BIT-NEXT: slwi 7, 7, 1 -; LE-32BIT-NEXT: slw 3, 7, 3 -; LE-32BIT-NEXT: or 6, 8, 6 -; LE-32BIT-NEXT: or 4, 10, 4 -; LE-32BIT-NEXT: or 3, 11, 3 -; LE-32BIT-NEXT: stw 9, 0(5) -; LE-32BIT-NEXT: stw 6, 12(5) -; LE-32BIT-NEXT: stw 4, 4(5) +; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: stw 7, 32(1) +; LE-32BIT-NEXT: subfic 9, 4, 32 +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 10, 6, 4 +; LE-32BIT-NEXT: lwz 3, 8(3) +; LE-32BIT-NEXT: slw 11, 7, 9 +; LE-32BIT-NEXT: slw 6, 6, 9 +; LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: slw 9, 3, 9 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: or 3, 6, 3 ; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 9, 8 +; LE-32BIT-NEXT: srw 4, 7, 4 +; LE-32BIT-NEXT: stw 3, 12(5) +; LE-32BIT-NEXT: or 3, 11, 10 +; LE-32BIT-NEXT: stw 4, 0(5) +; LE-32BIT-NEXT: stw 3, 4(5) ; LE-32BIT-NEXT: addi 1, 1, 48 ; LE-32BIT-NEXT: blr %src = load i128, ptr %src.ptr, align 1 @@ -304,34 +300,30 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stw 6, 40(1) ; LE-32BIT-NEXT: stw 6, 36(1) ; LE-32BIT-NEXT: stw 6, 32(1) -; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 31 +; LE-32BIT-NEXT: rlwinm 6, 4, 29, 28, 29 ; LE-32BIT-NEXT: stw 3, 28(1) ; LE-32BIT-NEXT: addi 3, 1, 16 ; LE-32BIT-NEXT: stw 9, 24(1) +; LE-32BIT-NEXT: clrlwi 4, 4, 27 ; LE-32BIT-NEXT: stw 8, 20(1) +; 
LE-32BIT-NEXT: subfic 8, 4, 32 ; LE-32BIT-NEXT: stw 7, 16(1) -; LE-32BIT-NEXT: li 7, 7 ; LE-32BIT-NEXT: lwzux 3, 6, 3 -; LE-32BIT-NEXT: nand 7, 4, 7 -; LE-32BIT-NEXT: clrlwi 4, 4, 29 -; LE-32BIT-NEXT: subfic 10, 4, 32 -; LE-32BIT-NEXT: lwz 8, 8(6) -; LE-32BIT-NEXT: clrlwi 7, 7, 27 ; LE-32BIT-NEXT: lwz 9, 4(6) ; LE-32BIT-NEXT: slw 3, 3, 4 +; LE-32BIT-NEXT: lwz 7, 8(6) ; LE-32BIT-NEXT: lwz 6, 12(6) ; LE-32BIT-NEXT: slw 11, 9, 4 -; LE-32BIT-NEXT: srw 9, 9, 10 -; LE-32BIT-NEXT: srw 10, 6, 10 -; LE-32BIT-NEXT: slw 6, 6, 4 -; LE-32BIT-NEXT: slw 4, 8, 4 -; LE-32BIT-NEXT: srwi 8, 8, 1 -; LE-32BIT-NEXT: srw 7, 8, 7 +; LE-32BIT-NEXT: srw 9, 9, 8 +; LE-32BIT-NEXT: srw 10, 7, 8 +; LE-32BIT-NEXT: srw 8, 6, 8 +; LE-32BIT-NEXT: slw 7, 7, 4 +; LE-32BIT-NEXT: slw 4, 6, 4 ; LE-32BIT-NEXT: or 3, 3, 9 -; LE-32BIT-NEXT: or 4, 4, 10 +; LE-32BIT-NEXT: stw 4, 12(5) +; LE-32BIT-NEXT: or 4, 7, 8 ; LE-32BIT-NEXT: stw 3, 0(5) -; LE-32BIT-NEXT: or 3, 11, 7 -; LE-32BIT-NEXT: stw 6, 12(5) +; LE-32BIT-NEXT: or 3, 11, 10 ; LE-32BIT-NEXT: stw 4, 8(5) ; LE-32BIT-NEXT: stw 3, 4(5) ; LE-32BIT-NEXT: addi 1, 1, 48 @@ -387,46 +379,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT: # %bb.0: ; LE-32BIT-NEXT: stwu 1, -48(1) ; LE-32BIT-NEXT: lwz 7, 0(3) -; LE-32BIT-NEXT: li 6, 7 +; LE-32BIT-NEXT: addi 6, 1, 32 ; LE-32BIT-NEXT: lwz 8, 4(3) ; LE-32BIT-NEXT: lwz 9, 8(3) ; LE-32BIT-NEXT: lwz 3, 12(3) ; LE-32BIT-NEXT: lwz 4, 12(4) ; LE-32BIT-NEXT: stw 3, 44(1) ; LE-32BIT-NEXT: srawi 3, 7, 31 -; LE-32BIT-NEXT: stw 8, 36(1) -; LE-32BIT-NEXT: rlwinm 8, 4, 29, 28, 31 ; LE-32BIT-NEXT: stw 7, 32(1) -; LE-32BIT-NEXT: addi 7, 1, 32 +; LE-32BIT-NEXT: rlwinm 7, 4, 29, 28, 29 ; LE-32BIT-NEXT: stw 9, 40(1) -; LE-32BIT-NEXT: nand 6, 4, 6 +; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: stw 8, 36(1) +; LE-32BIT-NEXT: subfic 9, 4, 32 ; LE-32BIT-NEXT: stw 3, 28(1) -; LE-32BIT-NEXT: clrlwi 4, 4, 29 ; LE-32BIT-NEXT: stw 3, 24(1) -; LE-32BIT-NEXT: subfic 10, 4, 32 ; LE-32BIT-NEXT: stw 
3, 20(1) -; LE-32BIT-NEXT: clrlwi 6, 6, 27 ; LE-32BIT-NEXT: stw 3, 16(1) -; LE-32BIT-NEXT: sub 3, 7, 8 -; LE-32BIT-NEXT: lwz 7, 4(3) -; LE-32BIT-NEXT: lwz 8, 8(3) -; LE-32BIT-NEXT: lwz 9, 0(3) -; LE-32BIT-NEXT: lwz 3, 12(3) -; LE-32BIT-NEXT: srw 11, 8, 4 -; LE-32BIT-NEXT: slw 8, 8, 10 -; LE-32BIT-NEXT: slw 10, 9, 10 +; LE-32BIT-NEXT: sub 3, 6, 7 +; LE-32BIT-NEXT: lwz 6, 4(3) +; LE-32BIT-NEXT: lwz 7, 0(3) +; LE-32BIT-NEXT: lwz 8, 12(3) +; LE-32BIT-NEXT: srw 10, 6, 4 +; LE-32BIT-NEXT: lwz 3, 8(3) +; LE-32BIT-NEXT: slw 11, 7, 9 +; LE-32BIT-NEXT: slw 6, 6, 9 +; LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: slw 9, 3, 9 ; LE-32BIT-NEXT: srw 3, 3, 4 -; LE-32BIT-NEXT: sraw 9, 9, 4 -; LE-32BIT-NEXT: srw 4, 7, 4 -; LE-32BIT-NEXT: slwi 7, 7, 1 -; LE-32BIT-NEXT: or 3, 8, 3 -; LE-32BIT-NEXT: slw 6, 7, 6 +; LE-32BIT-NEXT: or 3, 6, 3 +; LE-32BIT-NEXT: stw 3, 8(5) +; LE-32BIT-NEXT: or 3, 9, 8 +; LE-32BIT-NEXT: sraw 4, 7, 4 ; LE-32BIT-NEXT: stw 3, 12(5) -; LE-32BIT-NEXT: or 3, 10, 4 +; LE-32BIT-NEXT: or 3, 11, 10 +; LE-32BIT-NEXT: stw 4, 0(5) ; LE-32BIT-NEXT: stw 3, 4(5) -; LE-32BIT-NEXT: or 3, 11, 6 -; LE-32BIT-NEXT: stw 9, 0(5) -; LE-32BIT-NEXT: stw 3, 8(5) ; LE-32BIT-NEXT: addi 1, 1, 48 ; LE-32BIT-NEXT: blr %src = load i128, ptr %src.ptr, align 1 @@ -449,32 +437,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: li 4, 48 ; LE-64BIT-NEXT: stxvd2x 2, 7, 4 ; LE-64BIT-NEXT: stxvd2x 2, 7, 8 -; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 31 +; LE-64BIT-NEXT: rlwinm 4, 3, 29, 27, 28 +; LE-64BIT-NEXT: clrlwi 3, 3, 26 ; LE-64BIT-NEXT: stxvd2x 0, 7, 6 ; LE-64BIT-NEXT: stxvd2x 1, 0, 7 -; LE-64BIT-NEXT: li 6, 7 -; LE-64BIT-NEXT: ldux 7, 4, 7 -; LE-64BIT-NEXT: ld 8, 16(4) -; LE-64BIT-NEXT: nand 6, 3, 6 +; LE-64BIT-NEXT: xori 8, 3, 63 +; LE-64BIT-NEXT: ldux 6, 4, 7 +; LE-64BIT-NEXT: ld 7, 16(4) ; LE-64BIT-NEXT: ld 9, 8(4) -; LE-64BIT-NEXT: clrlwi 3, 3, 29 ; LE-64BIT-NEXT: ld 4, 24(4) -; LE-64BIT-NEXT: clrlwi 6, 6, 26 +; LE-64BIT-NEXT: srd 6, 6, 3 +; 
LE-64BIT-NEXT: sldi 11, 7, 1 +; LE-64BIT-NEXT: srd 10, 9, 3 ; LE-64BIT-NEXT: srd 7, 7, 3 -; LE-64BIT-NEXT: sldi 10, 8, 1 -; LE-64BIT-NEXT: srd 11, 9, 3 -; LE-64BIT-NEXT: srd 8, 8, 3 -; LE-64BIT-NEXT: sld 6, 10, 6 +; LE-64BIT-NEXT: sld 8, 11, 8 +; LE-64BIT-NEXT: or 8, 10, 8 ; LE-64BIT-NEXT: subfic 10, 3, 64 ; LE-64BIT-NEXT: srd 3, 4, 3 -; LE-64BIT-NEXT: or 6, 11, 6 ; LE-64BIT-NEXT: sld 11, 4, 10 ; LE-64BIT-NEXT: sld 9, 9, 10 ; LE-64BIT-NEXT: std 3, 24(5) -; LE-64BIT-NEXT: or 7, 9, 7 -; LE-64BIT-NEXT: or 3, 11, 8 -; LE-64BIT-NEXT: std 6, 8(5) -; LE-64BIT-NEXT: std 7, 0(5) +; LE-64BIT-NEXT: std 8, 8(5) +; LE-64BIT-NEXT: or 6, 9, 6 +; LE-64BIT-NEXT: or 3, 11, 7 +; LE-64BIT-NEXT: std 6, 0(5) ; LE-64BIT-NEXT: std 3, 16(5) ; LE-64BIT-NEXT: blr ; @@ -485,44 +471,39 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; BE-NEXT: ld 8, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 9, 1, -64 -; BE-NEXT: li 10, 0 -; BE-NEXT: addi 11, 1, -32 -; BE-NEXT: std 3, 56(9) -; BE-NEXT: rlwinm 3, 4, 29, 27, 31 +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 10, 1, -32 +; BE-NEXT: std 9, -40(1) +; BE-NEXT: std 9, -48(1) +; BE-NEXT: std 9, -56(1) +; BE-NEXT: std 9, -64(1) +; BE-NEXT: std 3, -8(1) +; BE-NEXT: rlwinm 3, 4, 29, 27, 28 ; BE-NEXT: neg 3, 3 -; BE-NEXT: std 10, 24(9) -; BE-NEXT: std 10, 16(9) -; BE-NEXT: std 10, 8(9) -; BE-NEXT: std 10, -64(1) -; BE-NEXT: std 8, 48(9) -; BE-NEXT: std 7, 40(9) -; BE-NEXT: std 6, 32(9) +; BE-NEXT: std 8, -16(1) +; BE-NEXT: std 7, -24(1) +; BE-NEXT: std 6, -32(1) ; BE-NEXT: extsw 3, 3 -; BE-NEXT: ldux 3, 11, 3 -; BE-NEXT: li 6, 7 -; BE-NEXT: nand 6, 4, 6 -; BE-NEXT: clrlwi 4, 4, 29 -; BE-NEXT: clrlwi 6, 6, 26 -; BE-NEXT: ld 7, 8(11) -; BE-NEXT: ld 8, 16(11) -; BE-NEXT: ld 9, 24(11) -; BE-NEXT: subfic 10, 4, 64 -; BE-NEXT: sldi 11, 7, 1 -; BE-NEXT: srd 7, 7, 4 -; BE-NEXT: srd 9, 9, 4 -; BE-NEXT: sld 6, 11, 6 -; BE-NEXT: sld 11, 3, 10 -; BE-NEXT: sld 10, 8, 10 -; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: ldux 3, 10, 3 
+; BE-NEXT: clrlwi 4, 4, 26 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 6, 8(10) +; BE-NEXT: ld 7, 24(10) +; BE-NEXT: ld 8, 16(10) +; BE-NEXT: sld 10, 3, 9 ; BE-NEXT: srd 3, 3, 4 -; BE-NEXT: or 7, 11, 7 -; BE-NEXT: or 6, 8, 6 -; BE-NEXT: or 8, 10, 9 ; BE-NEXT: std 3, 0(5) -; BE-NEXT: std 8, 24(5) -; BE-NEXT: std 7, 8(5) +; BE-NEXT: srd 11, 6, 4 +; BE-NEXT: srd 7, 7, 4 +; BE-NEXT: sld 6, 6, 9 +; BE-NEXT: sld 9, 8, 9 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 7, 9, 7 +; BE-NEXT: or 6, 6, 8 ; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 7, 24(5) +; BE-NEXT: std 10, 8(5) ; BE-NEXT: blr ; ; LE-32BIT-LABEL: lshr_32bytes: @@ -538,7 +519,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 0, 24(3) ; LE-32BIT-NEXT: lwz 3, 28(3) ; LE-32BIT-NEXT: lwz 4, 28(4) -; LE-32BIT-NEXT: stw 6, 48(1) ; LE-32BIT-NEXT: stw 6, 44(1) ; LE-32BIT-NEXT: stw 6, 40(1) ; LE-32BIT-NEXT: stw 6, 36(1) @@ -546,68 +526,65 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stw 6, 28(1) ; LE-32BIT-NEXT: stw 6, 24(1) ; LE-32BIT-NEXT: stw 6, 20(1) -; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31 -; LE-32BIT-NEXT: stw 3, 80(1) -; LE-32BIT-NEXT: addi 3, 1, 52 +; LE-32BIT-NEXT: stw 6, 16(1) +; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29 +; LE-32BIT-NEXT: stw 3, 76(1) +; LE-32BIT-NEXT: addi 3, 1, 48 ; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: sub 3, 3, 6 ; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: clrlwi 4, 4, 27 ; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill -; LE-32BIT-NEXT: stw 0, 76(1) -; LE-32BIT-NEXT: stw 12, 72(1) -; LE-32BIT-NEXT: stw 11, 68(1) -; LE-32BIT-NEXT: stw 10, 64(1) -; LE-32BIT-NEXT: stw 9, 60(1) -; LE-32BIT-NEXT: li 9, 7 -; LE-32BIT-NEXT: stw 8, 56(1) -; LE-32BIT-NEXT: nand 
9, 4, 9 -; LE-32BIT-NEXT: stw 7, 52(1) -; LE-32BIT-NEXT: clrlwi 4, 4, 29 -; LE-32BIT-NEXT: lwz 6, 4(3) ; LE-32BIT-NEXT: subfic 30, 4, 32 -; LE-32BIT-NEXT: lwz 7, 8(3) -; LE-32BIT-NEXT: clrlwi 9, 9, 27 -; LE-32BIT-NEXT: lwz 8, 12(3) -; LE-32BIT-NEXT: slwi 29, 6, 1 -; LE-32BIT-NEXT: lwz 10, 16(3) -; LE-32BIT-NEXT: srw 28, 7, 4 -; LE-32BIT-NEXT: lwz 11, 20(3) -; LE-32BIT-NEXT: slwi 27, 8, 1 -; LE-32BIT-NEXT: lwz 12, 24(3) +; LE-32BIT-NEXT: stw 0, 72(1) +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: xori 12, 4, 31 +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: lwz 6, 8(3) +; LE-32BIT-NEXT: lwz 7, 4(3) +; LE-32BIT-NEXT: lwz 8, 0(3) +; LE-32BIT-NEXT: srw 29, 6, 4 +; LE-32BIT-NEXT: lwz 9, 12(3) +; LE-32BIT-NEXT: slw 6, 6, 30 +; LE-32BIT-NEXT: lwz 10, 20(3) +; LE-32BIT-NEXT: slw 28, 8, 30 +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: srw 27, 9, 4 +; LE-32BIT-NEXT: lwz 0, 28(3) ; LE-32BIT-NEXT: srw 26, 10, 4 -; LE-32BIT-NEXT: lwz 0, 0(3) -; LE-32BIT-NEXT: srw 6, 6, 4 -; LE-32BIT-NEXT: lwz 3, 28(3) -; LE-32BIT-NEXT: srw 25, 12, 4 -; LE-32BIT-NEXT: slw 12, 12, 30 -; LE-32BIT-NEXT: slw 7, 7, 30 -; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: lwz 3, 24(3) +; LE-32BIT-NEXT: slw 25, 11, 30 +; LE-32BIT-NEXT: slw 9, 9, 30 ; LE-32BIT-NEXT: slw 10, 10, 30 -; LE-32BIT-NEXT: slw 30, 0, 30 -; LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: slw 30, 3, 30 +; LE-32BIT-NEXT: srw 3, 3, 4 ; LE-32BIT-NEXT: srw 0, 0, 4 -; LE-32BIT-NEXT: srw 4, 11, 4 -; LE-32BIT-NEXT: or 3, 12, 3 +; LE-32BIT-NEXT: or 3, 10, 3 +; LE-32BIT-NEXT: srw 11, 11, 4 +; LE-32BIT-NEXT: stw 3, 24(5) +; LE-32BIT-NEXT: or 3, 30, 0 ; LE-32BIT-NEXT: stw 3, 28(5) -; LE-32BIT-NEXT: or 3, 10, 4 -; LE-32BIT-NEXT: slwi 11, 11, 1 +; LE-32BIT-NEXT: or 3, 9, 11 +; LE-32BIT-NEXT: stw 3, 16(5) +; LE-32BIT-NEXT: or 3, 25, 26 +; LE-32BIT-NEXT: srw 8, 8, 4 +; LE-32BIT-NEXT: srw 4, 7, 4 +; LE-32BIT-NEXT: slwi 7, 
7, 1 ; LE-32BIT-NEXT: stw 3, 20(5) -; LE-32BIT-NEXT: or 3, 7, 8 -; LE-32BIT-NEXT: slw 29, 29, 9 -; LE-32BIT-NEXT: slw 27, 27, 9 -; LE-32BIT-NEXT: slw 9, 11, 9 +; LE-32BIT-NEXT: or 3, 6, 27 +; LE-32BIT-NEXT: slw 7, 7, 12 ; LE-32BIT-NEXT: stw 3, 12(5) -; LE-32BIT-NEXT: or 3, 30, 6 +; LE-32BIT-NEXT: or 3, 28, 4 ; LE-32BIT-NEXT: stw 3, 4(5) -; LE-32BIT-NEXT: or 3, 25, 9 -; LE-32BIT-NEXT: stw 3, 24(5) -; LE-32BIT-NEXT: or 3, 26, 27 -; LE-32BIT-NEXT: stw 3, 16(5) -; LE-32BIT-NEXT: or 3, 28, 29 -; LE-32BIT-NEXT: stw 0, 0(5) +; LE-32BIT-NEXT: or 3, 29, 7 +; LE-32BIT-NEXT: stw 8, 0(5) ; LE-32BIT-NEXT: stw 3, 8(5) ; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload ; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload @@ -635,37 +612,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-64BIT-NEXT: lxvd2x 0, 3, 6 ; LE-64BIT-NEXT: stxvd2x 2, 7, 6 ; LE-64BIT-NEXT: li 6, 48 -; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 31 +; LE-64BIT-NEXT: rlwinm 3, 4, 29, 27, 28 +; LE-64BIT-NEXT: clrlwi 4, 4, 26 ; LE-64BIT-NEXT: neg 3, 3 ; LE-64BIT-NEXT: stxvd2x 0, 7, 6 ; LE-64BIT-NEXT: li 6, 32 ; LE-64BIT-NEXT: extsw 3, 3 ; LE-64BIT-NEXT: stxvd2x 1, 7, 6 ; LE-64BIT-NEXT: stxvd2x 2, 0, 7 -; LE-64BIT-NEXT: li 6, 7 +; LE-64BIT-NEXT: subfic 6, 4, 64 ; LE-64BIT-NEXT: ldux 3, 8, 3 -; LE-64BIT-NEXT: ld 7, 8(8) -; LE-64BIT-NEXT: nand 6, 4, 6 -; LE-64BIT-NEXT: ld 9, 16(8) -; LE-64BIT-NEXT: clrlwi 4, 4, 29 -; LE-64BIT-NEXT: ld 8, 24(8) -; LE-64BIT-NEXT: clrlwi 6, 6, 26 -; LE-64BIT-NEXT: rldicl 10, 7, 63, 1 -; LE-64BIT-NEXT: sld 8, 8, 4 +; LE-64BIT-NEXT: ld 7, 16(8) +; LE-64BIT-NEXT: ld 9, 24(8) +; LE-64BIT-NEXT: ld 8, 8(8) +; LE-64BIT-NEXT: srd 10, 7, 6 +; LE-64BIT-NEXT: sld 9, 9, 4 ; LE-64BIT-NEXT: sld 7, 7, 4 -; LE-64BIT-NEXT: srd 6, 10, 6 -; LE-64BIT-NEXT: sld 10, 9, 4 -; LE-64BIT-NEXT: or 6, 10, 6 -; LE-64BIT-NEXT: subfic 10, 4, 64 -; LE-64BIT-NEXT: srd 9, 9, 10 -; LE-64BIT-NEXT: srd 10, 3, 10 +; LE-64BIT-NEXT: or 9, 9, 10 +; LE-64BIT-NEXT: srd 10, 8, 6 +; 
LE-64BIT-NEXT: srd 6, 3, 6 +; LE-64BIT-NEXT: sld 8, 8, 4 ; LE-64BIT-NEXT: sld 3, 3, 4 -; LE-64BIT-NEXT: std 6, 16(5) -; LE-64BIT-NEXT: or 7, 7, 10 +; LE-64BIT-NEXT: or 6, 8, 6 ; LE-64BIT-NEXT: std 3, 0(5) -; LE-64BIT-NEXT: or 3, 8, 9 -; LE-64BIT-NEXT: std 7, 8(5) -; LE-64BIT-NEXT: std 3, 24(5) +; LE-64BIT-NEXT: or 3, 7, 10 +; LE-64BIT-NEXT: std 9, 24(5) +; LE-64BIT-NEXT: std 6, 8(5) +; LE-64BIT-NEXT: std 3, 16(5) ; LE-64BIT-NEXT: blr ; ; BE-LABEL: shl_32bytes: @@ -675,41 +648,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; BE-NEXT: ld 8, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 9, 1, -64 -; BE-NEXT: li 10, 0 -; BE-NEXT: std 10, 56(9) -; BE-NEXT: std 10, 48(9) -; BE-NEXT: std 10, 40(9) -; BE-NEXT: std 10, 32(9) -; BE-NEXT: std 3, 24(9) -; BE-NEXT: std 8, 16(9) -; BE-NEXT: std 7, 8(9) +; BE-NEXT: li 9, 0 +; BE-NEXT: addi 10, 1, -64 +; BE-NEXT: std 9, -8(1) +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 9, -24(1) +; BE-NEXT: std 9, -32(1) +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 8, -48(1) +; BE-NEXT: std 7, -56(1) ; BE-NEXT: std 6, -64(1) -; BE-NEXT: rlwinm 3, 4, 29, 27, 31 -; BE-NEXT: ldux 6, 3, 9 -; BE-NEXT: li 7, 7 -; BE-NEXT: nand 7, 4, 7 -; BE-NEXT: clrlwi 4, 4, 29 -; BE-NEXT: clrlwi 7, 7, 26 -; BE-NEXT: ld 8, 16(3) -; BE-NEXT: ld 9, 8(3) +; BE-NEXT: rlwinm 3, 4, 29, 27, 28 +; BE-NEXT: ldux 6, 3, 10 +; BE-NEXT: clrlwi 4, 4, 26 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 16(3) +; BE-NEXT: ld 8, 8(3) ; BE-NEXT: ld 3, 24(3) -; BE-NEXT: subfic 10, 4, 64 ; BE-NEXT: sld 6, 6, 4 -; BE-NEXT: rldicl 11, 8, 63, 1 -; BE-NEXT: sld 8, 8, 4 -; BE-NEXT: srd 7, 11, 7 -; BE-NEXT: srd 11, 9, 10 -; BE-NEXT: sld 9, 9, 4 -; BE-NEXT: srd 10, 3, 10 +; BE-NEXT: srd 10, 7, 9 +; BE-NEXT: sld 11, 8, 4 +; BE-NEXT: srd 8, 8, 9 +; BE-NEXT: srd 9, 3, 9 +; BE-NEXT: sld 7, 7, 4 ; BE-NEXT: sld 3, 3, 4 -; BE-NEXT: or 6, 6, 11 -; BE-NEXT: or 7, 9, 7 -; BE-NEXT: or 8, 8, 10 +; BE-NEXT: or 10, 11, 10 +; BE-NEXT: or 6, 6, 8 +; BE-NEXT: or 
7, 7, 9 ; BE-NEXT: std 3, 24(5) -; BE-NEXT: std 8, 16(5) +; BE-NEXT: std 7, 16(5) ; BE-NEXT: std 6, 0(5) -; BE-NEXT: std 7, 8(5) +; BE-NEXT: std 10, 8(5) ; BE-NEXT: blr ; ; LE-32BIT-LABEL: shl_32bytes: @@ -731,7 +700,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill -; LE-32BIT-NEXT: stw 6, 80(1) ; LE-32BIT-NEXT: stw 6, 76(1) ; LE-32BIT-NEXT: stw 6, 72(1) ; LE-32BIT-NEXT: stw 6, 68(1) @@ -739,61 +707,56 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: stw 6, 60(1) ; LE-32BIT-NEXT: stw 6, 56(1) ; LE-32BIT-NEXT: stw 6, 52(1) -; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 31 -; LE-32BIT-NEXT: stw 3, 48(1) -; LE-32BIT-NEXT: addi 3, 1, 20 -; LE-32BIT-NEXT: stw 0, 44(1) -; LE-32BIT-NEXT: stw 12, 40(1) -; LE-32BIT-NEXT: stw 11, 36(1) -; LE-32BIT-NEXT: stw 10, 32(1) -; LE-32BIT-NEXT: stw 9, 28(1) -; LE-32BIT-NEXT: stw 8, 24(1) -; LE-32BIT-NEXT: li 8, 7 -; LE-32BIT-NEXT: stw 7, 20(1) -; LE-32BIT-NEXT: nand 8, 4, 8 +; LE-32BIT-NEXT: stw 6, 48(1) +; LE-32BIT-NEXT: rlwinm 6, 4, 29, 27, 29 +; LE-32BIT-NEXT: stw 3, 44(1) +; LE-32BIT-NEXT: addi 3, 1, 16 +; LE-32BIT-NEXT: stw 0, 40(1) +; LE-32BIT-NEXT: clrlwi 4, 4, 27 +; LE-32BIT-NEXT: stw 12, 36(1) +; LE-32BIT-NEXT: subfic 12, 4, 32 +; LE-32BIT-NEXT: stw 11, 32(1) +; LE-32BIT-NEXT: stw 10, 28(1) +; LE-32BIT-NEXT: stw 9, 24(1) +; LE-32BIT-NEXT: stw 8, 20(1) +; LE-32BIT-NEXT: stw 7, 16(1) ; LE-32BIT-NEXT: lwzux 3, 6, 3 -; LE-32BIT-NEXT: clrlwi 4, 4, 29 -; LE-32BIT-NEXT: subfic 0, 4, 32 -; LE-32BIT-NEXT: clrlwi 8, 8, 27 ; LE-32BIT-NEXT: lwz 7, 8(6) ; LE-32BIT-NEXT: slw 3, 3, 4 -; LE-32BIT-NEXT: lwz 9, 4(6) -; LE-32BIT-NEXT: lwz 10, 16(6) -; LE-32BIT-NEXT: srwi 29, 7, 1 -; LE-32BIT-NEXT: lwz 11, 12(6) -; LE-32BIT-NEXT: slw 28, 9, 4 -; LE-32BIT-NEXT: lwz 12, 24(6) -; LE-32BIT-NEXT: srwi 27, 10, 1 -; 
LE-32BIT-NEXT: lwz 30, 20(6) -; LE-32BIT-NEXT: slw 26, 11, 4 +; LE-32BIT-NEXT: lwz 8, 4(6) +; LE-32BIT-NEXT: lwz 9, 16(6) +; LE-32BIT-NEXT: srw 30, 7, 12 +; LE-32BIT-NEXT: lwz 10, 12(6) +; LE-32BIT-NEXT: slw 29, 8, 4 +; LE-32BIT-NEXT: lwz 11, 24(6) +; LE-32BIT-NEXT: srw 8, 8, 12 +; LE-32BIT-NEXT: lwz 0, 20(6) +; LE-32BIT-NEXT: srw 28, 9, 12 ; LE-32BIT-NEXT: lwz 6, 28(6) -; LE-32BIT-NEXT: srw 9, 9, 0 -; LE-32BIT-NEXT: slw 25, 30, 4 -; LE-32BIT-NEXT: srw 11, 11, 0 +; LE-32BIT-NEXT: slw 27, 10, 4 +; LE-32BIT-NEXT: srw 10, 10, 12 ; LE-32BIT-NEXT: slw 7, 7, 4 -; LE-32BIT-NEXT: srw 30, 30, 0 -; LE-32BIT-NEXT: slw 10, 10, 4 -; LE-32BIT-NEXT: srw 0, 6, 0 -; LE-32BIT-NEXT: slw 6, 6, 4 -; LE-32BIT-NEXT: slw 4, 12, 4 -; LE-32BIT-NEXT: srwi 12, 12, 1 -; LE-32BIT-NEXT: srw 29, 29, 8 -; LE-32BIT-NEXT: srw 27, 27, 8 -; LE-32BIT-NEXT: srw 8, 12, 8 -; LE-32BIT-NEXT: or 3, 3, 9 -; LE-32BIT-NEXT: or 4, 4, 0 -; LE-32BIT-NEXT: stw 3, 0(5) -; LE-32BIT-NEXT: or 3, 25, 8 +; LE-32BIT-NEXT: srw 26, 11, 12 +; LE-32BIT-NEXT: slw 25, 0, 4 +; LE-32BIT-NEXT: srw 0, 0, 12 +; LE-32BIT-NEXT: slw 9, 9, 4 +; LE-32BIT-NEXT: srw 12, 6, 12 +; LE-32BIT-NEXT: slw 11, 11, 4 +; LE-32BIT-NEXT: slw 4, 6, 4 +; LE-32BIT-NEXT: stw 4, 28(5) +; LE-32BIT-NEXT: or 4, 11, 12 ; LE-32BIT-NEXT: stw 4, 24(5) -; LE-32BIT-NEXT: or 4, 10, 30 -; LE-32BIT-NEXT: stw 3, 20(5) -; LE-32BIT-NEXT: or 3, 26, 27 +; LE-32BIT-NEXT: or 4, 9, 0 ; LE-32BIT-NEXT: stw 4, 16(5) -; LE-32BIT-NEXT: or 4, 7, 11 -; LE-32BIT-NEXT: stw 3, 12(5) -; LE-32BIT-NEXT: or 3, 28, 29 -; LE-32BIT-NEXT: stw 6, 28(5) +; LE-32BIT-NEXT: or 4, 25, 26 +; LE-32BIT-NEXT: stw 4, 20(5) +; LE-32BIT-NEXT: or 4, 7, 10 +; LE-32BIT-NEXT: or 3, 3, 8 ; LE-32BIT-NEXT: stw 4, 8(5) +; LE-32BIT-NEXT: or 4, 27, 28 +; LE-32BIT-NEXT: stw 3, 0(5) +; LE-32BIT-NEXT: or 3, 29, 30 +; LE-32BIT-NEXT: stw 4, 12(5) ; LE-32BIT-NEXT: stw 3, 4(5) ; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload ; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload @@ -812,98 +775,91 @@ define void 
@shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-64BIT-LABEL: ashr_32bytes: ; LE-64BIT: # %bb.0: -; LE-64BIT-NEXT: lxvd2x 0, 0, 3 ; LE-64BIT-NEXT: ld 6, 24(3) +; LE-64BIT-NEXT: lxvd2x 0, 0, 3 ; LE-64BIT-NEXT: lwz 4, 0(4) ; LE-64BIT-NEXT: addi 7, 1, -64 ; LE-64BIT-NEXT: ld 3, 16(3) ; LE-64BIT-NEXT: sradi 8, 6, 63 -; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 31 -; LE-64BIT-NEXT: std 6, 24(7) -; LE-64BIT-NEXT: std 3, 16(7) -; LE-64BIT-NEXT: li 3, 7 -; LE-64BIT-NEXT: std 8, 56(7) -; LE-64BIT-NEXT: std 8, 48(7) -; LE-64BIT-NEXT: std 8, 40(7) -; LE-64BIT-NEXT: std 8, 32(7) +; LE-64BIT-NEXT: rlwinm 9, 4, 29, 27, 28 +; LE-64BIT-NEXT: clrlwi 4, 4, 26 ; LE-64BIT-NEXT: stxvd2x 0, 0, 7 -; LE-64BIT-NEXT: nand 3, 4, 3 -; LE-64BIT-NEXT: clrlwi 4, 4, 29 -; LE-64BIT-NEXT: ldux 6, 9, 7 -; LE-64BIT-NEXT: ld 7, 16(9) +; LE-64BIT-NEXT: std 6, -40(1) +; LE-64BIT-NEXT: std 3, -48(1) +; LE-64BIT-NEXT: std 8, -8(1) +; LE-64BIT-NEXT: std 8, -16(1) +; LE-64BIT-NEXT: std 8, -24(1) +; LE-64BIT-NEXT: std 8, -32(1) +; LE-64BIT-NEXT: ldux 3, 9, 7 +; LE-64BIT-NEXT: xori 7, 4, 63 +; LE-64BIT-NEXT: ld 6, 16(9) ; LE-64BIT-NEXT: ld 8, 8(9) -; LE-64BIT-NEXT: clrlwi 3, 3, 26 ; LE-64BIT-NEXT: ld 9, 24(9) +; LE-64BIT-NEXT: srd 3, 3, 4 +; LE-64BIT-NEXT: sldi 11, 6, 1 +; LE-64BIT-NEXT: srd 10, 8, 4 ; LE-64BIT-NEXT: srd 6, 6, 4 -; LE-64BIT-NEXT: sldi 10, 7, 1 -; LE-64BIT-NEXT: srd 11, 8, 4 -; LE-64BIT-NEXT: srd 7, 7, 4 -; LE-64BIT-NEXT: sld 3, 10, 3 +; LE-64BIT-NEXT: sld 7, 11, 7 +; LE-64BIT-NEXT: or 7, 10, 7 ; LE-64BIT-NEXT: subfic 10, 4, 64 ; LE-64BIT-NEXT: srad 4, 9, 4 -; LE-64BIT-NEXT: or 3, 11, 3 -; LE-64BIT-NEXT: sld 11, 9, 10 ; LE-64BIT-NEXT: sld 8, 8, 10 +; LE-64BIT-NEXT: sld 11, 9, 10 ; LE-64BIT-NEXT: std 4, 24(5) -; LE-64BIT-NEXT: or 6, 8, 6 -; LE-64BIT-NEXT: or 4, 11, 7 -; LE-64BIT-NEXT: std 3, 8(5) -; LE-64BIT-NEXT: std 6, 0(5) -; LE-64BIT-NEXT: std 4, 16(5) +; LE-64BIT-NEXT: std 7, 8(5) +; LE-64BIT-NEXT: 
or 3, 8, 3 +; LE-64BIT-NEXT: std 3, 0(5) +; LE-64BIT-NEXT: or 3, 11, 6 +; LE-64BIT-NEXT: std 3, 16(5) ; LE-64BIT-NEXT: blr ; ; BE-LABEL: ashr_32bytes: ; BE: # %bb.0: -; BE-NEXT: ld 6, 0(3) -; BE-NEXT: ld 7, 8(3) -; BE-NEXT: ld 8, 16(3) +; BE-NEXT: ld 7, 0(3) +; BE-NEXT: ld 8, 8(3) +; BE-NEXT: ld 9, 16(3) ; BE-NEXT: ld 3, 24(3) ; BE-NEXT: lwz 4, 28(4) -; BE-NEXT: addi 9, 1, -64 -; BE-NEXT: addi 10, 1, -32 -; BE-NEXT: std 3, 56(9) -; BE-NEXT: std 6, 32(9) -; BE-NEXT: sradi 3, 6, 63 -; BE-NEXT: rlwinm 6, 4, 29, 27, 31 -; BE-NEXT: std 3, 24(9) -; BE-NEXT: std 3, 16(9) -; BE-NEXT: std 3, 8(9) +; BE-NEXT: addi 6, 1, -32 +; BE-NEXT: std 3, -8(1) +; BE-NEXT: std 7, -32(1) +; BE-NEXT: sradi 3, 7, 63 +; BE-NEXT: rlwinm 7, 4, 29, 27, 28 +; BE-NEXT: std 3, -40(1) +; BE-NEXT: std 3, -48(1) +; BE-NEXT: std 3, -56(1) ; BE-NEXT: std 3, -64(1) -; BE-NEXT: neg 3, 6 -; BE-NEXT: std 8, 48(9) -; BE-NEXT: std 7, 40(9) +; BE-NEXT: neg 3, 7 +; BE-NEXT: std 9, -16(1) +; BE-NEXT: std 8, -24(1) ; BE-NEXT: extsw 3, 3 -; BE-NEXT: ldux 3, 10, 3 -; BE-NEXT: li 6, 7 -; BE-NEXT: nand 6, 4, 6 -; BE-NEXT: clrlwi 4, 4, 29 -; BE-NEXT: clrlwi 6, 6, 26 -; BE-NEXT: ld 7, 8(10) -; BE-NEXT: ld 8, 16(10) -; BE-NEXT: ld 9, 24(10) -; BE-NEXT: subfic 10, 4, 64 -; BE-NEXT: sldi 11, 7, 1 -; BE-NEXT: srd 7, 7, 4 -; BE-NEXT: srd 9, 9, 4 -; BE-NEXT: sld 6, 11, 6 -; BE-NEXT: sld 11, 3, 10 -; BE-NEXT: sld 10, 8, 10 -; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: ldux 3, 6, 3 +; BE-NEXT: clrlwi 4, 4, 26 +; BE-NEXT: subfic 9, 4, 64 +; BE-NEXT: ld 7, 8(6) +; BE-NEXT: ld 8, 24(6) +; BE-NEXT: ld 6, 16(6) +; BE-NEXT: sld 10, 3, 9 ; BE-NEXT: srad 3, 3, 4 -; BE-NEXT: or 7, 11, 7 -; BE-NEXT: or 6, 8, 6 -; BE-NEXT: or 8, 10, 9 ; BE-NEXT: std 3, 0(5) -; BE-NEXT: std 8, 24(5) -; BE-NEXT: std 7, 8(5) +; BE-NEXT: srd 11, 7, 4 +; BE-NEXT: srd 8, 8, 4 +; BE-NEXT: sld 7, 7, 9 +; BE-NEXT: sld 9, 6, 9 +; BE-NEXT: srd 6, 6, 4 +; BE-NEXT: or 10, 10, 11 +; BE-NEXT: or 8, 9, 8 +; BE-NEXT: or 6, 7, 6 ; BE-NEXT: std 6, 16(5) +; BE-NEXT: std 8, 24(5) +; 
BE-NEXT: std 10, 8(5) ; BE-NEXT: blr ; ; LE-32BIT-LABEL: ashr_32bytes: ; LE-32BIT: # %bb.0: ; LE-32BIT-NEXT: stwu 1, -112(1) ; LE-32BIT-NEXT: lwz 7, 0(3) -; LE-32BIT-NEXT: addi 6, 1, 52 +; LE-32BIT-NEXT: addi 6, 1, 48 ; LE-32BIT-NEXT: lwz 8, 4(3) ; LE-32BIT-NEXT: lwz 9, 8(3) ; LE-32BIT-NEXT: lwz 10, 12(3) @@ -912,76 +868,72 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; LE-32BIT-NEXT: lwz 0, 24(3) ; LE-32BIT-NEXT: lwz 3, 28(3) ; LE-32BIT-NEXT: lwz 4, 28(4) -; LE-32BIT-NEXT: stw 3, 80(1) +; LE-32BIT-NEXT: stw 3, 76(1) ; LE-32BIT-NEXT: srawi 3, 7, 31 -; LE-32BIT-NEXT: stw 7, 52(1) -; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 31 +; LE-32BIT-NEXT: stw 7, 48(1) +; LE-32BIT-NEXT: rlwinm 7, 4, 29, 27, 29 ; LE-32BIT-NEXT: stw 25, 84(1) # 4-byte Folded Spill +; LE-32BIT-NEXT: clrlwi 4, 4, 27 ; LE-32BIT-NEXT: stw 26, 88(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 27, 92(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 28, 96(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 29, 100(1) # 4-byte Folded Spill ; LE-32BIT-NEXT: stw 30, 104(1) # 4-byte Folded Spill -; LE-32BIT-NEXT: stw 0, 76(1) -; LE-32BIT-NEXT: stw 12, 72(1) -; LE-32BIT-NEXT: stw 11, 68(1) -; LE-32BIT-NEXT: stw 10, 64(1) -; LE-32BIT-NEXT: stw 9, 60(1) -; LE-32BIT-NEXT: li 9, 7 -; LE-32BIT-NEXT: stw 8, 56(1) -; LE-32BIT-NEXT: nand 9, 4, 9 -; LE-32BIT-NEXT: stw 3, 48(1) -; LE-32BIT-NEXT: clrlwi 4, 4, 29 -; LE-32BIT-NEXT: stw 3, 44(1) ; LE-32BIT-NEXT: subfic 30, 4, 32 +; LE-32BIT-NEXT: stw 0, 72(1) +; LE-32BIT-NEXT: stw 12, 68(1) +; LE-32BIT-NEXT: xori 12, 4, 31 +; LE-32BIT-NEXT: stw 11, 64(1) +; LE-32BIT-NEXT: stw 10, 60(1) +; LE-32BIT-NEXT: stw 9, 56(1) +; LE-32BIT-NEXT: stw 8, 52(1) +; LE-32BIT-NEXT: stw 3, 44(1) ; LE-32BIT-NEXT: stw 3, 40(1) -; LE-32BIT-NEXT: clrlwi 9, 9, 27 ; LE-32BIT-NEXT: stw 3, 36(1) ; LE-32BIT-NEXT: stw 3, 32(1) ; LE-32BIT-NEXT: stw 3, 28(1) ; LE-32BIT-NEXT: stw 3, 24(1) ; LE-32BIT-NEXT: stw 3, 20(1) +; LE-32BIT-NEXT: stw 3, 16(1) ; LE-32BIT-NEXT: sub 3, 6, 7 -; 
LE-32BIT-NEXT: lwz 6, 4(3) -; LE-32BIT-NEXT: lwz 7, 8(3) -; LE-32BIT-NEXT: lwz 8, 12(3) -; LE-32BIT-NEXT: slwi 29, 6, 1 -; LE-32BIT-NEXT: lwz 10, 16(3) -; LE-32BIT-NEXT: srw 28, 7, 4 -; LE-32BIT-NEXT: lwz 11, 20(3) -; LE-32BIT-NEXT: slwi 27, 8, 1 -; LE-32BIT-NEXT: lwz 12, 24(3) +; LE-32BIT-NEXT: lwz 6, 8(3) +; LE-32BIT-NEXT: lwz 7, 4(3) +; LE-32BIT-NEXT: lwz 8, 0(3) +; LE-32BIT-NEXT: srw 29, 6, 4 +; LE-32BIT-NEXT: lwz 9, 12(3) +; LE-32BIT-NEXT: slw 6, 6, 30 +; LE-32BIT-NEXT: lwz 10, 20(3) +; LE-32BIT-NEXT: slw 28, 8, 30 +; LE-32BIT-NEXT: lwz 11, 16(3) +; LE-32BIT-NEXT: srw 27, 9, 4 +; LE-32BIT-NEXT: lwz 0, 28(3) ; LE-32BIT-NEXT: srw 26, 10, 4 -; LE-32BIT-NEXT: lwz 0, 0(3) -; LE-32BIT-NEXT: srw 6, 6, 4 -; LE-32BIT-NEXT: lwz 3, 28(3) -; LE-32BIT-NEXT: srw 25, 12, 4 -; LE-32BIT-NEXT: slw 12, 12, 30 -; LE-32BIT-NEXT: slw 7, 7, 30 -; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: lwz 3, 24(3) +; LE-32BIT-NEXT: slw 25, 11, 30 +; LE-32BIT-NEXT: slw 9, 9, 30 ; LE-32BIT-NEXT: slw 10, 10, 30 -; LE-32BIT-NEXT: slw 30, 0, 30 -; LE-32BIT-NEXT: srw 8, 8, 4 -; LE-32BIT-NEXT: sraw 0, 0, 4 -; LE-32BIT-NEXT: srw 4, 11, 4 -; LE-32BIT-NEXT: or 3, 12, 3 +; LE-32BIT-NEXT: slw 30, 3, 30 +; LE-32BIT-NEXT: srw 3, 3, 4 +; LE-32BIT-NEXT: srw 0, 0, 4 +; LE-32BIT-NEXT: or 3, 10, 3 +; LE-32BIT-NEXT: srw 11, 11, 4 +; LE-32BIT-NEXT: stw 3, 24(5) +; LE-32BIT-NEXT: or 3, 30, 0 ; LE-32BIT-NEXT: stw 3, 28(5) -; LE-32BIT-NEXT: or 3, 10, 4 -; LE-32BIT-NEXT: slwi 11, 11, 1 +; LE-32BIT-NEXT: or 3, 9, 11 +; LE-32BIT-NEXT: stw 3, 16(5) +; LE-32BIT-NEXT: or 3, 25, 26 +; LE-32BIT-NEXT: sraw 8, 8, 4 +; LE-32BIT-NEXT: srw 4, 7, 4 +; LE-32BIT-NEXT: slwi 7, 7, 1 ; LE-32BIT-NEXT: stw 3, 20(5) -; LE-32BIT-NEXT: or 3, 7, 8 -; LE-32BIT-NEXT: slw 29, 29, 9 -; LE-32BIT-NEXT: slw 27, 27, 9 -; LE-32BIT-NEXT: slw 9, 11, 9 +; LE-32BIT-NEXT: or 3, 6, 27 +; LE-32BIT-NEXT: slw 7, 7, 12 ; LE-32BIT-NEXT: stw 3, 12(5) -; LE-32BIT-NEXT: or 3, 30, 6 +; LE-32BIT-NEXT: or 3, 28, 4 ; LE-32BIT-NEXT: stw 3, 4(5) -; LE-32BIT-NEXT: or 3, 
25, 9 -; LE-32BIT-NEXT: stw 3, 24(5) -; LE-32BIT-NEXT: or 3, 26, 27 -; LE-32BIT-NEXT: stw 3, 16(5) -; LE-32BIT-NEXT: or 3, 28, 29 -; LE-32BIT-NEXT: stw 0, 0(5) +; LE-32BIT-NEXT: or 3, 29, 7 +; LE-32BIT-NEXT: stw 8, 0(5) ; LE-32BIT-NEXT: stw 3, 8(5) ; LE-32BIT-NEXT: lwz 30, 104(1) # 4-byte Folded Reload ; LE-32BIT-NEXT: lwz 29, 100(1) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index f61cbfd3ed72..5ba8755201dd 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -157,106 +157,33 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: sb zero, 31(sp) -; RV32I-NEXT: sb zero, 30(sp) -; RV32I-NEXT: sb zero, 29(sp) -; RV32I-NEXT: sb zero, 28(sp) -; RV32I-NEXT: sb zero, 27(sp) -; RV32I-NEXT: sb zero, 26(sp) -; RV32I-NEXT: sb zero, 25(sp) -; RV32I-NEXT: sb zero, 24(sp) -; RV32I-NEXT: sb zero, 23(sp) -; RV32I-NEXT: sb zero, 22(sp) -; RV32I-NEXT: sb zero, 21(sp) -; RV32I-NEXT: sb zero, 20(sp) -; RV32I-NEXT: sb zero, 19(sp) -; RV32I-NEXT: sb zero, 18(sp) -; RV32I-NEXT: sb zero, 17(sp) -; RV32I-NEXT: sb zero, 16(sp) -; RV32I-NEXT: sb a1, 12(sp) -; RV32I-NEXT: sb a5, 8(sp) -; RV32I-NEXT: sb a4, 4(sp) -; RV32I-NEXT: sb a3, 0(sp) -; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: sb a6, 15(sp) -; RV32I-NEXT: srli a6, a1, 16 -; RV32I-NEXT: sb a6, 14(sp) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 13(sp) -; RV32I-NEXT: srli a1, a5, 24 -; RV32I-NEXT: sb a1, 11(sp) -; RV32I-NEXT: srli a1, a5, 16 -; RV32I-NEXT: sb a1, 10(sp) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 9(sp) -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: sb a1, 7(sp) -; RV32I-NEXT: srli a1, a4, 16 -; RV32I-NEXT: sb a1, 6(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 5(sp) -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 3(sp) -; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 2(sp) -; RV32I-NEXT: srli 
a3, a3, 8 -; RV32I-NEXT: sb a3, 1(sp) -; RV32I-NEXT: slli a1, a2, 25 -; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: srli a1, a2, 3 +; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: mv a3, sp ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: lbu a3, 1(a1) -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a6, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: andi a2, a2, 7 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: lbu a4, 5(a1) -; RV32I-NEXT: lbu a5, 4(a1) -; RV32I-NEXT: lbu a6, 6(a1) -; RV32I-NEXT: lbu a7, 7(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a5, a7, a6 -; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 -; RV32I-NEXT: xori a6, a2, 31 +; RV32I-NEXT: andi a6, a2, 31 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 8(a1) ; RV32I-NEXT: sll a5, a5, a6 ; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: lbu a5, 9(a1) -; RV32I-NEXT: lbu a7, 8(a1) -; RV32I-NEXT: lbu t0, 10(a1) -; RV32I-NEXT: lbu t1, 11(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: not t0, a2 -; RV32I-NEXT: lbu t1, 13(a1) -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: or a4, a4, a7 -; RV32I-NEXT: lbu a7, 12(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: lbu t0, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: 
srl a5, a5, a2 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a5, a7, 1 +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sll a5, a5, a6 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: srl a5, a7, a2 ; RV32I-NEXT: slli a7, a1, 1 ; RV32I-NEXT: sll a6, a7, a6 ; RV32I-NEXT: or a5, a5, a6 @@ -299,110 +226,34 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 8(a1) ; RV32I-NEXT: lw a5, 4(a1) ; RV32I-NEXT: lw a1, 0(a1) -; RV32I-NEXT: sb a3, 12(sp) -; RV32I-NEXT: sb a4, 8(sp) -; RV32I-NEXT: sb a5, 4(sp) -; RV32I-NEXT: sb a1, 0(sp) -; RV32I-NEXT: srai a6, a3, 31 -; RV32I-NEXT: sb a6, 28(sp) -; RV32I-NEXT: sb a6, 24(sp) -; RV32I-NEXT: sb a6, 20(sp) -; RV32I-NEXT: sb a6, 16(sp) -; RV32I-NEXT: srli a7, a3, 24 -; RV32I-NEXT: sb a7, 15(sp) -; RV32I-NEXT: srli a7, a3, 16 -; RV32I-NEXT: sb a7, 14(sp) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 13(sp) -; RV32I-NEXT: srli a3, a4, 24 -; RV32I-NEXT: sb a3, 11(sp) -; RV32I-NEXT: srli a3, a4, 16 -; RV32I-NEXT: sb a3, 10(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 9(sp) -; RV32I-NEXT: srli a3, a5, 24 -; RV32I-NEXT: sb a3, 7(sp) -; RV32I-NEXT: srli a3, a5, 16 -; RV32I-NEXT: sb a3, 6(sp) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 5(sp) -; RV32I-NEXT: srli a3, a1, 24 -; RV32I-NEXT: sb a3, 3(sp) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 2(sp) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(sp) -; RV32I-NEXT: srli a1, a6, 24 -; RV32I-NEXT: sb a1, 31(sp) -; RV32I-NEXT: srli a3, a6, 16 -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a1, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: sb a1, 23(sp) -; RV32I-NEXT: sb a3, 22(sp) -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: sb a1, 19(sp) -; RV32I-NEXT: sb a3, 18(sp) -; RV32I-NEXT: sb a4, 17(sp) -; RV32I-NEXT: slli a1, a2, 25 -; RV32I-NEXT: srli a1, a1, 28 +; 
RV32I-NEXT: sw a3, 12(sp) +; RV32I-NEXT: sw a4, 8(sp) +; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a1, 0(sp) +; RV32I-NEXT: srai a3, a3, 31 +; RV32I-NEXT: sw a3, 28(sp) +; RV32I-NEXT: sw a3, 24(sp) +; RV32I-NEXT: sw a3, 20(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: srli a1, a2, 3 +; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: mv a3, sp ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: lbu a3, 1(a1) -; RV32I-NEXT: lbu a4, 0(a1) -; RV32I-NEXT: lbu a5, 2(a1) -; RV32I-NEXT: lbu a6, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: andi a2, a2, 7 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: srl a3, a3, a2 -; RV32I-NEXT: lbu a4, 5(a1) -; RV32I-NEXT: lbu a5, 4(a1) -; RV32I-NEXT: lbu a6, 6(a1) -; RV32I-NEXT: lbu a7, 7(a1) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a5 -; RV32I-NEXT: slli a6, a6, 16 -; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a5, a7, a6 -; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 -; RV32I-NEXT: xori a6, a2, 31 +; RV32I-NEXT: andi a6, a2, 31 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 8(a1) ; RV32I-NEXT: sll a5, a5, a6 ; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: srl a4, a4, a2 -; RV32I-NEXT: lbu a5, 9(a1) -; RV32I-NEXT: lbu a7, 8(a1) -; RV32I-NEXT: lbu t0, 10(a1) -; RV32I-NEXT: lbu t1, 11(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a7, t1, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: not t0, a2 -; RV32I-NEXT: lbu t1, 13(a1) -; RV32I-NEXT: sll a7, a7, t0 -; RV32I-NEXT: or a4, a4, a7 -; RV32I-NEXT: lbu a7, 12(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: lbu t0, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: srl a5, a5, a2 -; RV32I-NEXT: slli t0, t0, 16 -; RV32I-NEXT: slli a1, a1, 24 -; 
RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: slli a5, a7, 1 +; RV32I-NEXT: lw a1, 12(a1) +; RV32I-NEXT: sll a5, a5, a6 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: srl a5, a7, a2 ; RV32I-NEXT: slli a7, a1, 1 ; RV32I-NEXT: sll a6, a7, a6 ; RV32I-NEXT: or a5, a5, a6 @@ -445,114 +296,41 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV32I-NEXT: lw a4, 4(a1) ; RV32I-NEXT: lw a5, 8(a1) ; RV32I-NEXT: lw a1, 12(a1) -; RV32I-NEXT: sb zero, 15(sp) -; RV32I-NEXT: sb zero, 14(sp) -; RV32I-NEXT: sb zero, 13(sp) -; RV32I-NEXT: sb zero, 12(sp) -; RV32I-NEXT: sb zero, 11(sp) -; RV32I-NEXT: sb zero, 10(sp) -; RV32I-NEXT: sb zero, 9(sp) -; RV32I-NEXT: sb zero, 8(sp) -; RV32I-NEXT: sb zero, 7(sp) -; RV32I-NEXT: sb zero, 6(sp) -; RV32I-NEXT: sb zero, 5(sp) -; RV32I-NEXT: sb zero, 4(sp) -; RV32I-NEXT: sb zero, 3(sp) -; RV32I-NEXT: sb zero, 2(sp) -; RV32I-NEXT: sb zero, 1(sp) -; RV32I-NEXT: sb zero, 0(sp) -; RV32I-NEXT: sb a1, 28(sp) -; RV32I-NEXT: sb a5, 24(sp) -; RV32I-NEXT: sb a4, 20(sp) -; RV32I-NEXT: sb a3, 16(sp) -; RV32I-NEXT: srli a6, a1, 24 -; RV32I-NEXT: sb a6, 31(sp) -; RV32I-NEXT: srli a6, a1, 16 -; RV32I-NEXT: sb a6, 30(sp) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 29(sp) -; RV32I-NEXT: srli a1, a5, 24 -; RV32I-NEXT: sb a1, 27(sp) -; RV32I-NEXT: srli a1, a5, 16 -; RV32I-NEXT: sb a1, 26(sp) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 25(sp) -; RV32I-NEXT: srli a1, a4, 24 -; RV32I-NEXT: sb a1, 23(sp) -; RV32I-NEXT: srli a1, a4, 16 -; RV32I-NEXT: sb a1, 22(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: srli a1, a3, 24 -; RV32I-NEXT: sb a1, 19(sp) -; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: sb a1, 18(sp) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 17(sp) -; RV32I-NEXT: slli a1, a2, 25 -; RV32I-NEXT: srli a1, a1, 28 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: sw a5, 
24(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: srli a1, a2, 3 +; RV32I-NEXT: andi a1, a1, 12 ; RV32I-NEXT: addi a3, sp, 16 -; RV32I-NEXT: sub a1, a3, a1 -; RV32I-NEXT: lbu a3, 5(a1) -; RV32I-NEXT: lbu a4, 4(a1) -; RV32I-NEXT: lbu a5, 6(a1) -; RV32I-NEXT: lbu a6, 7(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: andi a2, a2, 7 -; RV32I-NEXT: sll a4, a3, a2 -; RV32I-NEXT: lbu a5, 1(a1) -; RV32I-NEXT: lbu a6, 0(a1) -; RV32I-NEXT: lbu a7, 2(a1) -; RV32I-NEXT: lbu t0, 3(a1) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: srli a6, a5, 1 -; RV32I-NEXT: xori a7, a2, 31 +; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: lw a1, 4(a3) +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: sll a5, a1, a2 +; RV32I-NEXT: srli a6, a4, 1 +; RV32I-NEXT: andi a7, a2, 31 +; RV32I-NEXT: lw t0, 8(a3) +; RV32I-NEXT: xori a7, a7, 31 ; RV32I-NEXT: srl a6, a6, a7 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 9(a1) -; RV32I-NEXT: lbu t0, 8(a1) -; RV32I-NEXT: lbu t1, 10(a1) -; RV32I-NEXT: lbu t2, 11(a1) -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a6, a6, t0 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a6, t0, a6 -; RV32I-NEXT: sll t0, a6, a2 -; RV32I-NEXT: srli a3, a3, 1 -; RV32I-NEXT: not t1, a2 -; RV32I-NEXT: srl a3, a3, t1 -; RV32I-NEXT: or a3, t0, a3 -; RV32I-NEXT: lbu t0, 13(a1) -; RV32I-NEXT: lbu t1, 12(a1) -; RV32I-NEXT: lbu t2, 14(a1) -; RV32I-NEXT: lbu a1, 15(a1) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t2 -; RV32I-NEXT: or a1, a1, t0 -; RV32I-NEXT: sll a1, a1, a2 -; RV32I-NEXT: srli a6, a6, 1 +; 
RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: sll a6, t0, a2 +; RV32I-NEXT: lw a3, 12(a3) +; RV32I-NEXT: srli a1, a1, 1 +; RV32I-NEXT: srl a1, a1, a7 +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: sll a3, a3, a2 +; RV32I-NEXT: srli a6, t0, 1 ; RV32I-NEXT: srl a6, a6, a7 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: sll a2, a5, a2 +; RV32I-NEXT: or a3, a3, a6 +; RV32I-NEXT: sll a2, a4, a2 ; RV32I-NEXT: sw a2, 0(a0) -; RV32I-NEXT: sw a1, 12(a0) -; RV32I-NEXT: sw a3, 8(a0) -; RV32I-NEXT: sw a4, 4(a0) +; RV32I-NEXT: sw a3, 12(a0) +; RV32I-NEXT: sw a1, 8(a0) +; RV32I-NEXT: sw a5, 4(a0) ; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll index b0d435368e92..29fe0a7de6b3 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -723,98 +723,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or 
a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb zero, 35(sp) -; RV32I-NEXT: sb zero, 34(sp) -; RV32I-NEXT: sb zero, 33(sp) -; RV32I-NEXT: sb zero, 32(sp) -; RV32I-NEXT: sb zero, 31(sp) -; RV32I-NEXT: sb zero, 30(sp) -; RV32I-NEXT: sb zero, 29(sp) -; RV32I-NEXT: sb zero, 28(sp) -; RV32I-NEXT: sb zero, 27(sp) -; RV32I-NEXT: sb zero, 26(sp) -; RV32I-NEXT: sb zero, 25(sp) -; RV32I-NEXT: sb zero, 24(sp) -; RV32I-NEXT: sb zero, 23(sp) -; RV32I-NEXT: sb zero, 22(sp) -; RV32I-NEXT: sb zero, 21(sp) -; RV32I-NEXT: sb zero, 20(sp) -; RV32I-NEXT: sb a0, 19(sp) -; RV32I-NEXT: sb s2, 18(sp) -; RV32I-NEXT: sb s1, 17(sp) -; RV32I-NEXT: sb s0, 16(sp) -; RV32I-NEXT: sb t6, 15(sp) -; RV32I-NEXT: sb t5, 14(sp) -; RV32I-NEXT: sb t4, 13(sp) -; RV32I-NEXT: sb t3, 12(sp) -; RV32I-NEXT: sb t2, 11(sp) -; RV32I-NEXT: sb t1, 10(sp) -; RV32I-NEXT: sb t0, 9(sp) -; RV32I-NEXT: sb a7, 8(sp) -; RV32I-NEXT: sb a6, 7(sp) -; RV32I-NEXT: sb a5, 6(sp) -; RV32I-NEXT: sb a4, 5(sp) -; RV32I-NEXT: sb a3, 4(sp) -; RV32I-NEXT: andi a1, a1, 15 -; RV32I-NEXT: addi a0, sp, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) 
-; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: andi a0, a1, 12 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: srl a4, a3, a1 +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: andi a6, a1, 24 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 0(a0) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or t0, a4, t0 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sll a3, a3, a6 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli t1, a0, 1 +; RV32I-NEXT: sll a6, t1, a6 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: sb a5, 8(a2) 
+; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a6, 16 +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: srli a0, a6, 24 +; RV32I-NEXT: sb a0, 11(a2) +; RV32I-NEXT: srli a0, a6, 8 +; RV32I-NEXT: sb a0, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a0, 3(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: srli a0, t0, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a0, t0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 @@ -823,6 +842,222 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { store i128 %res, ptr %dst, align 1 ret void } + +define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 9(a0) +; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 13(a0) +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 
+; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a1) +; RV64I-NEXT: lbu a5, 4(a1) +; RV64I-NEXT: lbu a6, 6(a1) +; RV64I-NEXT: lbu a7, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: slli a4, a4, 37 +; RV64I-NEXT: or a5, a4, a1 +; RV64I-NEXT: addi a4, a5, -64 +; RV64I-NEXT: srl a1, a3, a5 +; RV64I-NEXT: bltz a4, .LBB7_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: j .LBB7_3 +; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: lbu a6, 1(a0) +; RV64I-NEXT: lbu a7, 0(a0) +; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a0) +; RV64I-NEXT: lbu t0, 4(a0) +; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: srl a0, a0, a5 +; RV64I-NEXT: not a5, a5 +; RV64I-NEXT: slli a3, a3, 1 +; RV64I-NEXT: sll a3, a3, a5 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: .LBB7_3: +; RV64I-NEXT: srai a4, a4, 63 +; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: sb a1, 8(a2) +; 
RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 15(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 14(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 12(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 1(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, 
a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 8(a0) +; RV32I-NEXT: lw a3, 12(a0) +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: sb a5, 10(a2) +; RV32I-NEXT: srli a5, a1, 24 +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a3, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a1, a4, 16 +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: srli a1, a4, 24 +; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = lshr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_16bytes: ; RV64I: # %bb.0: @@ -873,11 +1108,11 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a5, a4, a1 ; RV64I-NEXT: 
addi a4, a5, -64 ; RV64I-NEXT: sll a1, a3, a5 -; RV64I-NEXT: bltz a4, .LBB7_2 +; RV64I-NEXT: bltz a4, .LBB8_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: j .LBB7_3 -; RV64I-NEXT: .LBB7_2: +; RV64I-NEXT: j .LBB8_3 +; RV64I-NEXT: .LBB8_2: ; RV64I-NEXT: lbu a6, 9(a0) ; RV64I-NEXT: lbu a7, 8(a0) ; RV64I-NEXT: lbu t0, 10(a0) @@ -905,7 +1140,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli a3, a3, 1 ; RV64I-NEXT: srl a3, a3, a5 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: .LBB7_3: +; RV64I-NEXT: .LBB8_3: ; RV64I-NEXT: srai a4, a4, 63 ; RV64I-NEXT: and a1, a4, a1 ; RV64I-NEXT: sb a1, 0(a2) @@ -942,98 +1177,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; 
RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) ; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb zero, 19(sp) -; RV32I-NEXT: sb zero, 18(sp) -; RV32I-NEXT: sb zero, 17(sp) -; RV32I-NEXT: sb zero, 16(sp) -; RV32I-NEXT: sb zero, 15(sp) -; RV32I-NEXT: sb zero, 14(sp) -; RV32I-NEXT: sb zero, 13(sp) -; RV32I-NEXT: sb zero, 12(sp) -; RV32I-NEXT: sb zero, 11(sp) -; RV32I-NEXT: sb zero, 10(sp) -; RV32I-NEXT: sb zero, 9(sp) -; RV32I-NEXT: sb zero, 8(sp) -; RV32I-NEXT: sb zero, 7(sp) -; RV32I-NEXT: sb zero, 6(sp) -; RV32I-NEXT: sb zero, 5(sp) -; RV32I-NEXT: sb zero, 4(sp) -; RV32I-NEXT: sb a0, 35(sp) -; RV32I-NEXT: sb s2, 34(sp) -; RV32I-NEXT: sb s1, 33(sp) -; RV32I-NEXT: sb s0, 32(sp) -; RV32I-NEXT: sb t6, 31(sp) -; RV32I-NEXT: sb t5, 30(sp) -; RV32I-NEXT: sb t4, 29(sp) -; RV32I-NEXT: sb t3, 28(sp) -; RV32I-NEXT: sb t2, 27(sp) -; RV32I-NEXT: sb t1, 26(sp) -; RV32I-NEXT: sb t0, 25(sp) -; RV32I-NEXT: sb a7, 24(sp) -; RV32I-NEXT: sb a6, 23(sp) -; RV32I-NEXT: sb a5, 22(sp) -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: sb a3, 20(sp) -; RV32I-NEXT: andi a1, a1, 15 -; RV32I-NEXT: addi a0, sp, 20 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; 
RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: andi a0, a1, 12 +; RV32I-NEXT: addi a3, sp, 16 +; RV32I-NEXT: sub a3, a3, a0 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: sll a5, a0, a1 +; RV32I-NEXT: andi a6, a1, 24 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: srli a7, a4, 1 +; RV32I-NEXT: lw t0, 12(a3) +; RV32I-NEXT: lw a3, 8(a3) +; RV32I-NEXT: srl a7, a7, a6 +; RV32I-NEXT: or a7, a5, a7 +; RV32I-NEXT: sll t0, t0, a1 +; RV32I-NEXT: srli t1, a3, 1 +; RV32I-NEXT: srl t1, t1, a6 +; RV32I-NEXT: or t1, t0, t1 +; RV32I-NEXT: sll a3, a3, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a6 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: sll a1, a4, a1 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, t0, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a5, a5, 24 +; RV32I-NEXT: sb a5, 7(a2) +; RV32I-NEXT: sb a0, 8(a2) +; RV32I-NEXT: sb t1, 12(a2) +; RV32I-NEXT: sb a7, 4(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: srli 
a0, a0, 8 ; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: srli a0, t1, 16 +; RV32I-NEXT: sb a0, 14(a2) +; RV32I-NEXT: srli a0, t1, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a7, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, a7, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 @@ -1042,6 +1296,223 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { store i128 %res, ptr %dst, align 1 ret void } + +define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a1) +; RV64I-NEXT: lbu a5, 4(a1) +; RV64I-NEXT: lbu a6, 6(a1) +; RV64I-NEXT: lbu a7, 7(a1) +; 
RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 1(a1) +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: or a1, a1, a5 +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: slli a4, a4, 37 +; RV64I-NEXT: or a5, a4, a1 +; RV64I-NEXT: addi a4, a5, -64 +; RV64I-NEXT: sll a1, a3, a5 +; RV64I-NEXT: bltz a4, .LBB9_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: j .LBB9_3 +; RV64I-NEXT: .LBB9_2: +; RV64I-NEXT: lbu a6, 9(a0) +; RV64I-NEXT: lbu a7, 8(a0) +; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 13(a0) +; RV64I-NEXT: lbu t0, 12(a0) +; RV64I-NEXT: lbu t1, 14(a0) +; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: sll a0, a0, a5 +; RV64I-NEXT: not a5, a5 +; RV64I-NEXT: srli a3, a3, 1 +; RV64I-NEXT: srl a3, a3, a5 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: .LBB9_3: +; RV64I-NEXT: srai a4, a4, 63 +; RV64I-NEXT: and a1, a4, a1 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 7(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 3(a2) +; 
RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 
28(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lw a1, 8(a0) +; RV32I-NEXT: lw a3, 12(a0) +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: sb a5, 10(a2) +; RV32I-NEXT: srli a5, a1, 24 +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a3, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a1, a4, 16 +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: srli a1, a4, 24 +; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = shl i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + + define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_16bytes: ; RV64I: # %bb.0: @@ -1092,13 +1563,13 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a5, a5, a1 ; RV64I-NEXT: addi a6, a5, -64 ; RV64I-NEXT: sra a1, a3, a5 -; RV64I-NEXT: bltz a6, .LBB8_2 +; RV64I-NEXT: bltz a6, .LBB10_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: sraiw a3, a4, 31 ; RV64I-NEXT: mv a0, a1 ; RV64I-NEXT: mv a1, a3 -; RV64I-NEXT: j .LBB8_3 -; RV64I-NEXT: .LBB8_2: +; RV64I-NEXT: j .LBB10_3 +; RV64I-NEXT: .LBB10_2: ; 
RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a6, 0(a0) ; RV64I-NEXT: lbu a7, 2(a0) @@ -1126,7 +1597,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: slli a3, a3, 1 ; RV64I-NEXT: sll a3, a3, a4 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: .LBB8_3: +; RV64I-NEXT: .LBB10_3: ; RV64I-NEXT: sb a1, 8(a2) ; RV64I-NEXT: srli a3, a1, 56 ; RV64I-NEXT: sb a3, 15(a2) @@ -1161,105 +1632,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 15(a0) -; RV32I-NEXT: slli a4, a3, 24 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t2, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; RV32I-NEXT: lbu t5, 8(a0) -; RV32I-NEXT: lbu t6, 9(a0) -; RV32I-NEXT: lbu s0, 10(a0) -; RV32I-NEXT: lbu s1, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 14(a0) -; RV32I-NEXT: lbu a0, 13(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 15(sp) -; RV32I-NEXT: sb s3, 14(sp) -; RV32I-NEXT: sb a0, 13(sp) -; RV32I-NEXT: sb s2, 12(sp) -; RV32I-NEXT: sb s1, 11(sp) -; RV32I-NEXT: sb s0, 10(sp) -; RV32I-NEXT: sb t6, 9(sp) -; RV32I-NEXT: sb t5, 8(sp) -; RV32I-NEXT: sb t4, 7(sp) -; RV32I-NEXT: sb t3, 6(sp) -; RV32I-NEXT: sb t2, 5(sp) -; RV32I-NEXT: sb t1, 4(sp) -; RV32I-NEXT: sb t0, 3(sp) -; RV32I-NEXT: sb a7, 2(sp) -; RV32I-NEXT: sb a6, 1(sp) -; RV32I-NEXT: sb a5, 0(sp) -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: sb a4, 28(sp) -; RV32I-NEXT: sb a4, 24(sp) -; RV32I-NEXT: sb a4, 20(sp) -; RV32I-NEXT: sb a4, 16(sp) -; RV32I-NEXT: srli a0, a4, 24 -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: 
srli a3, a4, 16 -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: sb a0, 23(sp) -; RV32I-NEXT: sb a3, 22(sp) -; RV32I-NEXT: sb a4, 21(sp) -; RV32I-NEXT: sb a0, 19(sp) -; RV32I-NEXT: sb a3, 18(sp) -; RV32I-NEXT: sb a4, 17(sp) -; RV32I-NEXT: andi a1, a1, 15 -; RV32I-NEXT: mv a0, sp -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, 
a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a7, a0, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: andi a0, a1, 12 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: srl a4, a3, a1 +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: andi a6, a1, 24 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 0(a0) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or t0, a4, t0 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sll a3, a3, a6 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli t1, a0, 1 +; RV32I-NEXT: sll a6, t1, a6 +; RV32I-NEXT: or a6, a5, a6 +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; 
RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a6, 16 +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: srli a0, a6, 24 +; RV32I-NEXT: sb a0, 11(a2) +; RV32I-NEXT: srli a0, a6, 8 +; RV32I-NEXT: sb a0, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a0, 3(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: srli a0, t0, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a0, t0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 @@ -1269,441 +1753,645 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ret void } +define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_16bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: lbu a3, 9(a0) +; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 13(a0) +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: lbu a5, 5(a1) +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu 
a7, 6(a1) +; RV64I-NEXT: lbu t0, 7(a1) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu a1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: slli a1, a1, 5 +; RV64I-NEXT: slli a5, a5, 37 +; RV64I-NEXT: or a5, a5, a1 +; RV64I-NEXT: addi a6, a5, -64 +; RV64I-NEXT: sra a1, a3, a5 +; RV64I-NEXT: bltz a6, .LBB11_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sraiw a3, a4, 31 +; RV64I-NEXT: mv a0, a1 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: j .LBB11_3 +; RV64I-NEXT: .LBB11_2: +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a6, 0(a0) +; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lbu a6, 5(a0) +; RV64I-NEXT: lbu a7, 4(a0) +; RV64I-NEXT: lbu t0, 6(a0) +; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a4 +; RV64I-NEXT: srl a0, a0, a5 +; RV64I-NEXT: not a4, a5 +; RV64I-NEXT: slli a3, a3, 1 +; RV64I-NEXT: sll a3, a3, a4 +; RV64I-NEXT: or a0, a0, a3 +; RV64I-NEXT: .LBB11_3: +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 15(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 14(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 13(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 12(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 
11(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a0, 0(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 1(a2) +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_16bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a7, a0, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw 
a0, 20(sp) +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 12 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lw a1, 8(a0) +; RV32I-NEXT: lw a3, 12(a0) +; RV32I-NEXT: lw a4, 0(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: sb a1, 8(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a5, a1, 16 +; RV32I-NEXT: sb a5, 10(a2) +; RV32I-NEXT: srli a5, a1, 24 +; RV32I-NEXT: sb a5, 11(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 9(a2) +; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a3, 24 +; RV32I-NEXT: sb a1, 15(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a1, a4, 16 +; RV32I-NEXT: sb a1, 2(a2) +; RV32I-NEXT: srli a1, a4, 24 +; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 1(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret + %src = load i128, ptr %src.ptr, align 1 + %wordOff = load i128, ptr %wordOff.ptr, align 1 + %bitOff = shl i128 %wordOff, 5 + %res = ashr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; 
RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: lbu ra, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) -; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 87(sp) -; RV64I-NEXT: sb a4, 86(sp) -; RV64I-NEXT: sb a0, 85(sp) -; RV64I-NEXT: sb a5, 84(sp) -; RV64I-NEXT: sb a6, 83(sp) -; RV64I-NEXT: sb a7, 82(sp) -; RV64I-NEXT: sb zero, 119(sp) -; RV64I-NEXT: sb zero, 118(sp) -; RV64I-NEXT: sb zero, 117(sp) -; RV64I-NEXT: sb zero, 116(sp) -; RV64I-NEXT: sb zero, 115(sp) -; RV64I-NEXT: sb zero, 114(sp) -; 
RV64I-NEXT: sb zero, 113(sp) -; RV64I-NEXT: sb zero, 112(sp) -; RV64I-NEXT: sb zero, 111(sp) -; RV64I-NEXT: sb zero, 110(sp) -; RV64I-NEXT: sb zero, 109(sp) -; RV64I-NEXT: sb zero, 108(sp) -; RV64I-NEXT: sb zero, 107(sp) -; RV64I-NEXT: sb zero, 106(sp) -; RV64I-NEXT: sb zero, 105(sp) -; RV64I-NEXT: sb zero, 104(sp) -; RV64I-NEXT: sb zero, 103(sp) -; RV64I-NEXT: sb zero, 102(sp) -; RV64I-NEXT: sb zero, 101(sp) -; RV64I-NEXT: sb zero, 100(sp) -; RV64I-NEXT: sb zero, 99(sp) -; RV64I-NEXT: sb zero, 98(sp) -; RV64I-NEXT: sb zero, 97(sp) -; RV64I-NEXT: sb zero, 96(sp) -; RV64I-NEXT: sb zero, 95(sp) -; RV64I-NEXT: sb zero, 94(sp) -; RV64I-NEXT: sb zero, 93(sp) -; RV64I-NEXT: sb zero, 92(sp) -; RV64I-NEXT: sb zero, 91(sp) -; RV64I-NEXT: sb zero, 90(sp) -; RV64I-NEXT: sb zero, 89(sp) -; RV64I-NEXT: sb zero, 88(sp) -; RV64I-NEXT: sb t0, 81(sp) -; RV64I-NEXT: sb ra, 80(sp) -; RV64I-NEXT: sb s11, 79(sp) -; RV64I-NEXT: sb s10, 78(sp) -; RV64I-NEXT: sb s9, 77(sp) -; RV64I-NEXT: sb s8, 76(sp) -; RV64I-NEXT: sb s7, 75(sp) -; RV64I-NEXT: sb s6, 74(sp) -; RV64I-NEXT: sb s5, 73(sp) -; RV64I-NEXT: sb s4, 72(sp) -; RV64I-NEXT: sb s3, 71(sp) -; RV64I-NEXT: sb s2, 70(sp) -; RV64I-NEXT: sb s1, 69(sp) -; RV64I-NEXT: sb s0, 68(sp) -; RV64I-NEXT: sb t6, 67(sp) -; RV64I-NEXT: sb t5, 66(sp) -; RV64I-NEXT: sb t4, 65(sp) -; RV64I-NEXT: sb t3, 64(sp) -; RV64I-NEXT: sb t2, 63(sp) -; RV64I-NEXT: sb t1, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: andi a1, a1, 31 -; RV64I-NEXT: addi a0, sp, 56 -; RV64I-NEXT: add a6, a0, a1 -; RV64I-NEXT: lbu a0, 
8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) -; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) -; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) -; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or 
a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: 
slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: sd zero, 48(sp) +; RV64I-NEXT: sd zero, 40(sp) +; RV64I-NEXT: sd zero, 32(sp) +; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: andi a0, a1, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: srl a5, a4, a1 +; RV64I-NEXT: ld a6, 16(a3) +; RV64I-NEXT: andi a0, a1, 56 +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 0(a3) +; RV64I-NEXT: slli a0, a6, 1 +; RV64I-NEXT: sll a0, a0, a7 +; RV64I-NEXT: or a0, a5, a0 +; RV64I-NEXT: srl t0, t0, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli t1, a3, 1 +; RV64I-NEXT: sll a7, t1, a7 +; RV64I-NEXT: or a7, a6, a7 +; RV64I-NEXT: srl a1, a3, a1 +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: sb t0, 0(a2) +; RV64I-NEXT: sb a5, 8(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 31(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 30(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 27(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 26(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a7, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a7, 
48 ; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: srli a1, a7, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a7, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a7, 24 +; RV64I-NEXT: sb a1, 19(a2) +; RV64I-NEXT: srli a1, a7, 16 +; RV64I-NEXT: sb a1, 18(a2) +; RV64I-NEXT: srli a1, a7, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; 
RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 9(a2) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -80 +; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill 
+; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: lbu ra, 24(a0) -; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) -; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 59(sp) -; RV32I-NEXT: sb a4, 58(sp) -; RV32I-NEXT: sb a0, 57(sp) -; RV32I-NEXT: sb a5, 56(sp) -; RV32I-NEXT: sb a6, 55(sp) -; RV32I-NEXT: sb a7, 54(sp) -; RV32I-NEXT: sb zero, 91(sp) -; RV32I-NEXT: sb zero, 90(sp) -; RV32I-NEXT: sb zero, 89(sp) -; RV32I-NEXT: sb zero, 88(sp) -; RV32I-NEXT: sb zero, 87(sp) -; RV32I-NEXT: sb zero, 86(sp) -; RV32I-NEXT: sb zero, 85(sp) -; RV32I-NEXT: sb zero, 84(sp) -; RV32I-NEXT: sb zero, 83(sp) -; RV32I-NEXT: sb zero, 82(sp) -; RV32I-NEXT: sb zero, 81(sp) -; RV32I-NEXT: sb zero, 80(sp) -; RV32I-NEXT: sb zero, 79(sp) -; RV32I-NEXT: sb zero, 78(sp) -; RV32I-NEXT: sb zero, 77(sp) -; RV32I-NEXT: sb zero, 76(sp) -; RV32I-NEXT: sb zero, 75(sp) -; RV32I-NEXT: sb zero, 74(sp) -; 
RV32I-NEXT: sb zero, 73(sp) -; RV32I-NEXT: sb zero, 72(sp) -; RV32I-NEXT: sb zero, 71(sp) -; RV32I-NEXT: sb zero, 70(sp) -; RV32I-NEXT: sb zero, 69(sp) -; RV32I-NEXT: sb zero, 68(sp) -; RV32I-NEXT: sb zero, 67(sp) -; RV32I-NEXT: sb zero, 66(sp) -; RV32I-NEXT: sb zero, 65(sp) -; RV32I-NEXT: sb zero, 64(sp) -; RV32I-NEXT: sb zero, 63(sp) -; RV32I-NEXT: sb zero, 62(sp) -; RV32I-NEXT: sb zero, 61(sp) -; RV32I-NEXT: sb zero, 60(sp) -; RV32I-NEXT: sb t0, 53(sp) -; RV32I-NEXT: sb ra, 52(sp) -; RV32I-NEXT: sb s11, 51(sp) -; RV32I-NEXT: sb s10, 50(sp) -; RV32I-NEXT: sb s9, 49(sp) -; RV32I-NEXT: sb s8, 48(sp) -; RV32I-NEXT: sb s7, 47(sp) -; RV32I-NEXT: sb s6, 46(sp) -; RV32I-NEXT: sb s5, 45(sp) -; RV32I-NEXT: sb s4, 44(sp) -; RV32I-NEXT: sb s3, 43(sp) -; RV32I-NEXT: sb s2, 42(sp) -; RV32I-NEXT: sb s1, 41(sp) -; RV32I-NEXT: sb s0, 40(sp) -; RV32I-NEXT: sb t6, 39(sp) -; RV32I-NEXT: sb t5, 38(sp) -; RV32I-NEXT: sb t4, 37(sp) -; RV32I-NEXT: sb t3, 36(sp) -; RV32I-NEXT: sb t2, 35(sp) -; RV32I-NEXT: sb t1, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: andi a1, a1, 31 -; RV32I-NEXT: addi a0, sp, 28 -; RV32I-NEXT: add a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) -; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) -; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) -; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; 
RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; 
RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t3, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 40(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 
12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: andi a0, a1, 28 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a5, a3, a0 +; RV32I-NEXT: lw a3, 4(a5) +; RV32I-NEXT: slli a6, a1, 3 +; RV32I-NEXT: srl a4, a3, a6 +; RV32I-NEXT: lw a7, 8(a5) +; RV32I-NEXT: andi a0, a6, 24 +; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: lw a1, 0(a5) +; RV32I-NEXT: slli a0, a7, 1 +; RV32I-NEXT: sll a0, a0, t0 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: srl t1, a1, a6 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw t2, 12(a5) +; RV32I-NEXT: lw t3, 16(a5) +; RV32I-NEXT: sll a1, a3, t0 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: srl t4, t2, a6 +; RV32I-NEXT: slli a3, t3, 1 +; RV32I-NEXT: sll a3, a3, t0 +; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: srl a7, a7, a6 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: lw t5, 20(a5) +; RV32I-NEXT: lw t6, 24(a5) +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: or t2, a7, t2 +; RV32I-NEXT: srl s0, t5, a6 +; RV32I-NEXT: slli s1, t6, 1 +; RV32I-NEXT: sll s1, s1, t0 +; RV32I-NEXT: or s1, s0, s1 +; RV32I-NEXT: srl t3, t3, a6 +; RV32I-NEXT: slli t5, t5, 1 +; RV32I-NEXT: lw a5, 28(a5) +; RV32I-NEXT: sll t5, t5, t0 +; RV32I-NEXT: or t5, t3, t5 +; RV32I-NEXT: srl t6, t6, a6 +; RV32I-NEXT: slli s2, a5, 1 +; RV32I-NEXT: sll t0, s2, t0 +; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: srl a5, a5, a6 +; RV32I-NEXT: sb t6, 24(a2) ; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) +; RV32I-NEXT: sb t3, 16(a2) +; RV32I-NEXT: sb s0, 20(a2) +; RV32I-NEXT: sb a7, 8(a2) ; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; 
RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb t1, 0(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a5, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, a5, 16 +; RV32I-NEXT: sb a4, 30(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 29(a2) +; RV32I-NEXT: srli a4, t0, 24 +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t5, 24 +; RV32I-NEXT: sb a4, 19(a2) +; RV32I-NEXT: srli a4, t5, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t5, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, s1, 24 +; RV32I-NEXT: sb a4, 23(a2) +; RV32I-NEXT: srli a4, s1, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli s1, s1, 8 +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: srli a4, t2, 24 +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a4, t2, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, t2, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte 
Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 80 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -1712,441 +2400,1167 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { store i256 %res, ptr %dst, align 1 ret void } -define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; RV64I-LABEL: shl_32bytes: + +define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_32bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; 
RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) -; RV64I-NEXT: lbu ra, 24(a0) -; RV64I-NEXT: lbu t0, 25(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu a6, 27(a0) -; RV64I-NEXT: lbu a5, 28(a0) -; RV64I-NEXT: lbu a3, 31(a0) -; RV64I-NEXT: lbu a4, 30(a0) -; RV64I-NEXT: lbu a0, 29(a0) -; RV64I-NEXT: lbu a1, 0(a1) -; RV64I-NEXT: sb a3, 119(sp) -; RV64I-NEXT: sb a4, 118(sp) -; RV64I-NEXT: sb a0, 117(sp) -; RV64I-NEXT: sb a5, 116(sp) -; RV64I-NEXT: sb a6, 115(sp) -; RV64I-NEXT: sb a7, 114(sp) -; RV64I-NEXT: sb zero, 87(sp) -; RV64I-NEXT: sb zero, 86(sp) -; RV64I-NEXT: sb zero, 85(sp) -; RV64I-NEXT: sb zero, 84(sp) -; RV64I-NEXT: sb zero, 83(sp) -; RV64I-NEXT: sb zero, 82(sp) -; RV64I-NEXT: sb zero, 81(sp) -; RV64I-NEXT: sb zero, 80(sp) -; RV64I-NEXT: sb zero, 79(sp) -; RV64I-NEXT: sb zero, 78(sp) -; RV64I-NEXT: sb zero, 77(sp) -; RV64I-NEXT: sb zero, 76(sp) -; RV64I-NEXT: sb zero, 
75(sp) -; RV64I-NEXT: sb zero, 74(sp) -; RV64I-NEXT: sb zero, 73(sp) -; RV64I-NEXT: sb zero, 72(sp) -; RV64I-NEXT: sb zero, 71(sp) -; RV64I-NEXT: sb zero, 70(sp) -; RV64I-NEXT: sb zero, 69(sp) -; RV64I-NEXT: sb zero, 68(sp) -; RV64I-NEXT: sb zero, 67(sp) -; RV64I-NEXT: sb zero, 66(sp) -; RV64I-NEXT: sb zero, 65(sp) -; RV64I-NEXT: sb zero, 64(sp) -; RV64I-NEXT: sb zero, 63(sp) -; RV64I-NEXT: sb zero, 62(sp) -; RV64I-NEXT: sb zero, 61(sp) -; RV64I-NEXT: sb zero, 60(sp) -; RV64I-NEXT: sb zero, 59(sp) -; RV64I-NEXT: sb zero, 58(sp) -; RV64I-NEXT: sb zero, 57(sp) -; RV64I-NEXT: sb zero, 56(sp) -; RV64I-NEXT: sb t0, 113(sp) -; RV64I-NEXT: sb ra, 112(sp) -; RV64I-NEXT: sb s11, 111(sp) -; RV64I-NEXT: sb s10, 110(sp) -; RV64I-NEXT: sb s9, 109(sp) -; RV64I-NEXT: sb s8, 108(sp) -; RV64I-NEXT: sb s7, 107(sp) -; RV64I-NEXT: sb s6, 106(sp) -; RV64I-NEXT: sb s5, 105(sp) -; RV64I-NEXT: sb s4, 104(sp) -; RV64I-NEXT: sb s3, 103(sp) -; RV64I-NEXT: sb s2, 102(sp) -; RV64I-NEXT: sb s1, 101(sp) -; RV64I-NEXT: sb s0, 100(sp) -; RV64I-NEXT: sb t6, 99(sp) -; RV64I-NEXT: sb t5, 98(sp) -; RV64I-NEXT: sb t4, 97(sp) -; RV64I-NEXT: sb t3, 96(sp) -; RV64I-NEXT: sb t2, 95(sp) -; RV64I-NEXT: sb t1, 94(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 93(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 92(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 91(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 90(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: andi a1, a1, 31 -; RV64I-NEXT: addi a0, sp, 88 -; RV64I-NEXT: sub a6, a0, a1 -; RV64I-NEXT: lbu a0, 8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) -; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte 
Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) -; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) -; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: 
or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: 
or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: sd zero, 48(sp) +; RV64I-NEXT: sd zero, 40(sp) +; RV64I-NEXT: sd zero, 32(sp) +; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: slli a0, a1, 2 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: slli a5, a1, 5 +; RV64I-NEXT: srl a1, a4, a5 +; RV64I-NEXT: ld a6, 16(a3) +; RV64I-NEXT: andi a0, a5, 32 +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 0(a3) +; RV64I-NEXT: slli a0, a6, 1 +; RV64I-NEXT: sll a0, a0, a7 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srl t0, t0, a5 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: srl a6, a6, a5 +; RV64I-NEXT: slli t1, a3, 1 +; RV64I-NEXT: sll a7, t1, a7 +; RV64I-NEXT: or a7, a6, a7 +; RV64I-NEXT: srl a3, a3, a5 +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb t0, 0(a2) +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: srli a5, a6, 24 +; RV64I-NEXT: sb a5, 19(a2) +; RV64I-NEXT: srli a5, a6, 16 ; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb 
t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: srli a5, a6, 8 +; RV64I-NEXT: sb a5, 17(a2) +; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: sb a5, 31(a2) +; RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: sb a5, 30(a2) +; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: sb a5, 29(a2) +; RV64I-NEXT: srli a5, a3, 32 +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: srli a5, a3, 24 +; RV64I-NEXT: sb a5, 27(a2) +; RV64I-NEXT: srli a5, a3, 16 +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, t0, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, t0, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: srli a3, t0, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: srli a1, a7, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a7, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a7, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a7, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: sb a4, 4(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a0, a0, 32 ; RV64I-NEXT: sb a0, 12(a2) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 64 +; 
RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: 
lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 40(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; 
RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: lshr_32bytes_dwordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, 
a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or 
a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: sd zero, 48(sp) +; RV64I-NEXT: sd zero, 40(sp) +; RV64I-NEXT: sd zero, 32(sp) +; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: mv a0, sp +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a1, 16(a0) +; RV64I-NEXT: ld a3, 24(a0) +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a0, 8(a0) +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: sb a5, 23(a2) +; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: sb a5, 22(a2) +; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: sb a5, 21(a2) +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: sb a5, 19(a2) +; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a3, 56 +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a3, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a3, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a3, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a3, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a3, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; 
RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 9(a2) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV32I-LABEL: lshr_32bytes_dwordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, 
t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 40(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: 
sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_32bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli 
a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, 
a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: sd zero, 8(sp) +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a5, 48(sp) +; RV64I-NEXT: sd a4, 40(sp) +; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: andi a0, a1, 24 +; RV64I-NEXT: addi a3, sp, 32 +; RV64I-NEXT: sub a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: ld a5, 0(a3) +; RV64I-NEXT: sll a6, a4, a1 +; RV64I-NEXT: andi a0, a1, 56 +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: srli a0, a5, 1 +; RV64I-NEXT: ld t0, 24(a3) +; RV64I-NEXT: ld a3, 16(a3) +; RV64I-NEXT: srl a0, a0, a7 +; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: sll t0, t0, a1 +; RV64I-NEXT: srli t1, a3, 1 +; RV64I-NEXT: srl t1, t1, a7 +; RV64I-NEXT: or t1, t0, t1 +; RV64I-NEXT: sll a3, a3, a1 +; RV64I-NEXT: srli a4, a4, 1 +; RV64I-NEXT: srl a4, a4, a7 
+; RV64I-NEXT: or a4, a3, a4 +; RV64I-NEXT: sll a1, a5, a1 +; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: srli a3, a3, 56 +; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: srli a3, t0, 56 +; RV64I-NEXT: sb a3, 31(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 7(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: srli a1, a6, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb t1, 24(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 19(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 18(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 17(a2) +; RV64I-NEXT: srli a1, t1, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, t1, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, t1, 32 +; 
RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, t1, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, t1, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: srli a1, t1, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -80 +; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw 
a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) -; RV32I-NEXT: lbu ra, 24(a0) -; RV32I-NEXT: lbu t0, 25(a0) -; RV32I-NEXT: lbu a7, 26(a0) -; RV32I-NEXT: lbu a6, 27(a0) -; RV32I-NEXT: lbu a5, 28(a0) -; RV32I-NEXT: lbu a3, 31(a0) -; RV32I-NEXT: lbu a4, 30(a0) -; RV32I-NEXT: lbu a0, 29(a0) -; RV32I-NEXT: lbu a1, 0(a1) -; RV32I-NEXT: sb a3, 91(sp) -; RV32I-NEXT: sb a4, 90(sp) -; RV32I-NEXT: sb a0, 89(sp) -; RV32I-NEXT: sb a5, 88(sp) -; RV32I-NEXT: sb a6, 87(sp) -; RV32I-NEXT: sb a7, 86(sp) -; RV32I-NEXT: sb zero, 59(sp) -; RV32I-NEXT: sb zero, 58(sp) -; RV32I-NEXT: sb zero, 57(sp) -; RV32I-NEXT: sb zero, 56(sp) -; RV32I-NEXT: sb zero, 55(sp) -; RV32I-NEXT: sb zero, 54(sp) -; RV32I-NEXT: sb zero, 53(sp) -; RV32I-NEXT: sb zero, 52(sp) -; RV32I-NEXT: sb zero, 51(sp) -; RV32I-NEXT: sb zero, 50(sp) -; RV32I-NEXT: sb zero, 49(sp) -; RV32I-NEXT: sb zero, 48(sp) -; RV32I-NEXT: sb zero, 47(sp) -; RV32I-NEXT: sb zero, 46(sp) -; RV32I-NEXT: sb zero, 45(sp) -; RV32I-NEXT: sb zero, 44(sp) -; RV32I-NEXT: sb zero, 43(sp) -; RV32I-NEXT: sb zero, 42(sp) -; RV32I-NEXT: sb zero, 41(sp) -; RV32I-NEXT: sb zero, 40(sp) -; RV32I-NEXT: sb zero, 39(sp) -; RV32I-NEXT: sb zero, 38(sp) -; RV32I-NEXT: sb zero, 37(sp) -; RV32I-NEXT: sb zero, 36(sp) -; RV32I-NEXT: sb zero, 35(sp) -; RV32I-NEXT: sb zero, 34(sp) -; RV32I-NEXT: sb zero, 33(sp) -; RV32I-NEXT: sb zero, 32(sp) -; RV32I-NEXT: sb zero, 31(sp) -; RV32I-NEXT: sb zero, 30(sp) -; RV32I-NEXT: sb zero, 29(sp) -; RV32I-NEXT: sb zero, 
28(sp) -; RV32I-NEXT: sb t0, 85(sp) -; RV32I-NEXT: sb ra, 84(sp) -; RV32I-NEXT: sb s11, 83(sp) -; RV32I-NEXT: sb s10, 82(sp) -; RV32I-NEXT: sb s9, 81(sp) -; RV32I-NEXT: sb s8, 80(sp) -; RV32I-NEXT: sb s7, 79(sp) -; RV32I-NEXT: sb s6, 78(sp) -; RV32I-NEXT: sb s5, 77(sp) -; RV32I-NEXT: sb s4, 76(sp) -; RV32I-NEXT: sb s3, 75(sp) -; RV32I-NEXT: sb s2, 74(sp) -; RV32I-NEXT: sb s1, 73(sp) -; RV32I-NEXT: sb s0, 72(sp) -; RV32I-NEXT: sb t6, 71(sp) -; RV32I-NEXT: sb t5, 70(sp) -; RV32I-NEXT: sb t4, 69(sp) -; RV32I-NEXT: sb t3, 68(sp) -; RV32I-NEXT: sb t2, 67(sp) -; RV32I-NEXT: sb t1, 66(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 65(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 64(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 63(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 62(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: andi a1, a1, 31 -; RV32I-NEXT: addi a0, sp, 60 -; RV32I-NEXT: sub a6, a0, a1 -; RV32I-NEXT: lbu a0, 6(a6) -; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) -; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) -; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu 
s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 
20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu t2, 1(a1) +; RV32I-NEXT: lbu t3, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: andi a0, a1, 28 +; RV32I-NEXT: addi a3, sp, 32 +; RV32I-NEXT: sub a6, a3, a0 +; RV32I-NEXT: lw a3, 4(a6) +; RV32I-NEXT: slli a7, a1, 3 +; RV32I-NEXT: lw t0, 0(a6) +; RV32I-NEXT: sll a4, a3, a7 +; RV32I-NEXT: andi a0, a7, 24 +; RV32I-NEXT: xori t1, a0, 31 +; RV32I-NEXT: srli a0, t0, 1 +; RV32I-NEXT: lw t2, 12(a6) +; RV32I-NEXT: lw a5, 8(a6) +; RV32I-NEXT: srl a0, a0, 
t1 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: sll t3, t2, a7 +; RV32I-NEXT: srli a1, a5, 1 +; RV32I-NEXT: srl a1, a1, t1 +; RV32I-NEXT: or a1, t3, a1 +; RV32I-NEXT: sll t4, a5, a7 +; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: lw t5, 20(a6) +; RV32I-NEXT: lw t6, 16(a6) +; RV32I-NEXT: srl a3, a3, t1 +; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: sll s0, t5, a7 +; RV32I-NEXT: srli a5, t6, 1 +; RV32I-NEXT: srl a5, a5, t1 +; RV32I-NEXT: or a5, s0, a5 +; RV32I-NEXT: sll t6, t6, a7 +; RV32I-NEXT: srli t2, t2, 1 +; RV32I-NEXT: lw s1, 28(a6) +; RV32I-NEXT: lw a6, 24(a6) +; RV32I-NEXT: srl t2, t2, t1 +; RV32I-NEXT: or t2, t6, t2 +; RV32I-NEXT: sll s1, s1, a7 +; RV32I-NEXT: srli s2, a6, 1 +; RV32I-NEXT: srl s2, s2, t1 +; RV32I-NEXT: or s2, s1, s2 +; RV32I-NEXT: sll a6, a6, a7 +; RV32I-NEXT: srli t5, t5, 1 +; RV32I-NEXT: srl t1, t5, t1 +; RV32I-NEXT: or t1, a6, t1 +; RV32I-NEXT: sll a7, t0, a7 +; RV32I-NEXT: sb a7, 0(a2) +; RV32I-NEXT: srli a6, a6, 24 +; RV32I-NEXT: sb a6, 27(a2) +; RV32I-NEXT: srli s1, s1, 24 +; RV32I-NEXT: sb s1, 31(a2) +; RV32I-NEXT: srli a6, t6, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli s0, s0, 24 +; RV32I-NEXT: sb s0, 23(a2) +; RV32I-NEXT: srli a6, t4, 24 +; RV32I-NEXT: sb a6, 11(a2) +; RV32I-NEXT: srli a6, t3, 24 +; RV32I-NEXT: sb a6, 15(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 3(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 2(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 1(a2) +; RV32I-NEXT: srli a4, a4, 24 +; RV32I-NEXT: sb a4, 7(a2) +; RV32I-NEXT: sb t1, 24(a2) +; RV32I-NEXT: sb s2, 28(a2) +; RV32I-NEXT: sb t2, 16(a2) +; RV32I-NEXT: sb a5, 20(a2) +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a4, t1, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: srli a4, t1, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, s2, 16 +; RV32I-NEXT: sb a4, 30(a2) +; RV32I-NEXT: srli a4, s2, 8 ; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb 
ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: srli a4, t2, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t2, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, a5, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 21(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 13(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 
108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 80 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2155,457 +3569,1169 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { store i256 %res, ptr %dst, align 1 ret void } -define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; RV64I-LABEL: ashr_32bytes: + +define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_32bytes_wordOff: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv t0, a1 -; RV64I-NEXT: lbu t1, 31(a0) -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: sd a1, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: sd a1, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 2(a0) -; RV64I-NEXT: sd a1, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu 
a1, 3(a0) -; RV64I-NEXT: sd a1, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 4(a0) -; RV64I-NEXT: sd a1, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a1, 5(a0) -; RV64I-NEXT: sd a1, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t2, 6(a0) -; RV64I-NEXT: lbu t3, 7(a0) -; RV64I-NEXT: lbu t4, 8(a0) -; RV64I-NEXT: lbu t5, 9(a0) -; RV64I-NEXT: lbu t6, 10(a0) -; RV64I-NEXT: lbu s0, 11(a0) -; RV64I-NEXT: lbu s1, 12(a0) -; RV64I-NEXT: lbu s2, 13(a0) -; RV64I-NEXT: lbu s3, 14(a0) -; RV64I-NEXT: lbu s4, 15(a0) -; RV64I-NEXT: lbu s5, 16(a0) -; RV64I-NEXT: lbu s6, 17(a0) -; RV64I-NEXT: lbu s7, 18(a0) -; RV64I-NEXT: lbu s8, 19(a0) -; RV64I-NEXT: lbu s9, 20(a0) -; RV64I-NEXT: lbu s10, 21(a0) -; RV64I-NEXT: lbu s11, 22(a0) -; RV64I-NEXT: lbu ra, 23(a0) +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, 
t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, 
a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: sd zero, 8(sp) +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a5, 48(sp) +; RV64I-NEXT: sd a4, 40(sp) +; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: slli a0, a1, 2 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: addi a3, sp, 32 +; RV64I-NEXT: sub a0, a3, a0 +; RV64I-NEXT: ld a4, 8(a0) +; RV64I-NEXT: slli a5, a1, 5 +; RV64I-NEXT: ld a6, 0(a0) +; RV64I-NEXT: sll a3, a4, a5 +; RV64I-NEXT: andi a1, a5, 32 +; RV64I-NEXT: xori a7, a1, 63 +; RV64I-NEXT: srli a1, a6, 1 +; RV64I-NEXT: ld t0, 24(a0) +; RV64I-NEXT: ld t1, 16(a0) +; RV64I-NEXT: srl a0, a1, a7 +; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: sll t0, t0, a5 +; RV64I-NEXT: srli a1, t1, 1 +; RV64I-NEXT: srl a1, a1, a7 +; RV64I-NEXT: or a1, t0, a1 +; RV64I-NEXT: sll t1, t1, a5 +; RV64I-NEXT: srli a4, a4, 1 +; RV64I-NEXT: srl a4, a4, a7 +; RV64I-NEXT: or a4, t1, a4 +; RV64I-NEXT: sll a5, a6, a5 +; RV64I-NEXT: sb a5, 0(a2) +; RV64I-NEXT: srli a6, t1, 56 +; RV64I-NEXT: sb a6, 23(a2) +; RV64I-NEXT: srli a6, t1, 48 +; RV64I-NEXT: sb a6, 22(a2) +; RV64I-NEXT: srli a6, t1, 40 +; RV64I-NEXT: sb a6, 21(a2) +; RV64I-NEXT: srli a6, t1, 32 +; RV64I-NEXT: sb a6, 20(a2) +; RV64I-NEXT: srli a6, t0, 56 +; RV64I-NEXT: sb a6, 31(a2) +; RV64I-NEXT: srli a6, t0, 48 +; RV64I-NEXT: sb a6, 30(a2) +; RV64I-NEXT: srli a6, t0, 40 +; RV64I-NEXT: sb a6, 29(a2) +; RV64I-NEXT: srli a6, t0, 32 +; RV64I-NEXT: sb a6, 28(a2) +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 6(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 5(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 4(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 3(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 2(a2) +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: sb a5, 1(a2) +; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: sb a5, 15(a2) +; 
RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: sb a5, 13(a2) +; RV64I-NEXT: srli a3, a3, 32 +; RV64I-NEXT: sb a3, 12(a2) +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a3, a4, 24 +; RV64I-NEXT: sb a3, 19(a2) +; RV64I-NEXT: srli a3, a4, 16 +; RV64I-NEXT: sb a3, 18(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 17(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 27(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 26(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV32I-LABEL: shl_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, 
a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: andi a1, a1, 28 +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: sub a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw 
a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 
%wordOff, 5 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: shl_32bytes_dwordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) 
+; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: lbu a6, 25(a0) -; RV64I-NEXT: lbu a5, 26(a0) -; RV64I-NEXT: lbu a4, 27(a0) -; RV64I-NEXT: lbu a1, 30(a0) -; RV64I-NEXT: lbu a3, 29(a0) -; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: lbu t0, 0(t0) -; RV64I-NEXT: sb a1, 86(sp) -; RV64I-NEXT: sb a3, 85(sp) -; RV64I-NEXT: sb a0, 84(sp) -; RV64I-NEXT: sb a4, 83(sp) -; RV64I-NEXT: sb a5, 82(sp) -; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 -; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb ra, 79(sp) -; RV64I-NEXT: sb s11, 78(sp) -; RV64I-NEXT: sb s10, 77(sp) -; RV64I-NEXT: sb s9, 76(sp) -; RV64I-NEXT: sb s8, 75(sp) -; RV64I-NEXT: sb s7, 74(sp) -; RV64I-NEXT: sb s6, 73(sp) -; RV64I-NEXT: sb s5, 72(sp) -; RV64I-NEXT: sb s4, 71(sp) -; RV64I-NEXT: sb s3, 70(sp) -; RV64I-NEXT: sb s2, 69(sp) -; RV64I-NEXT: sb s1, 68(sp) -; RV64I-NEXT: sb s0, 67(sp) -; RV64I-NEXT: sb t6, 66(sp) -; RV64I-NEXT: sb t5, 65(sp) -; RV64I-NEXT: sb t4, 64(sp) -; RV64I-NEXT: sb t3, 63(sp) -; RV64I-NEXT: sb t2, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 -; RV64I-NEXT: sb a0, 112(sp) -; RV64I-NEXT: sb a0, 104(sp) -; RV64I-NEXT: sb a0, 96(sp) -; RV64I-NEXT: sb a0, 88(sp) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; 
RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: sd zero, 8(sp) +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a5, 48(sp) +; RV64I-NEXT: sd a4, 40(sp) +; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: addi a0, sp, 32 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ld a1, 16(a0) +; RV64I-NEXT: ld a3, 24(a0) +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a0, 8(a0) +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: sb a5, 23(a2) +; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: sb a5, 22(a2) +; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: sb a5, 21(a2) +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: sb a5, 19(a2) +; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a3, 56 +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a3, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a3, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a3, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a3, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a3, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a1, a4, 56 +; 
RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) ; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: sb a1, 119(sp) -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: sb a3, 118(sp) -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: sb a4, 117(sp) -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 116(sp) -; RV64I-NEXT: srli a6, a0, 24 -; RV64I-NEXT: sb a6, 115(sp) -; RV64I-NEXT: srli a7, a0, 16 -; RV64I-NEXT: sb a7, 114(sp) +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) ; RV64I-NEXT: srli a0, a0, 8 -; RV64I-NEXT: sb a0, 113(sp) -; RV64I-NEXT: sb a1, 111(sp) -; RV64I-NEXT: sb a3, 110(sp) -; RV64I-NEXT: sb a4, 109(sp) -; RV64I-NEXT: sb a5, 108(sp) -; RV64I-NEXT: sb a6, 107(sp) -; RV64I-NEXT: sb a7, 106(sp) -; RV64I-NEXT: sb a0, 105(sp) -; RV64I-NEXT: sb a1, 103(sp) -; RV64I-NEXT: sb a3, 102(sp) -; RV64I-NEXT: sb a4, 101(sp) -; RV64I-NEXT: sb a5, 100(sp) -; RV64I-NEXT: sb a6, 99(sp) -; RV64I-NEXT: sb a7, 98(sp) -; RV64I-NEXT: sb a0, 97(sp) -; RV64I-NEXT: sb a1, 95(sp) -; RV64I-NEXT: sb a3, 94(sp) -; RV64I-NEXT: sb a4, 93(sp) -; RV64I-NEXT: sb a5, 92(sp) -; RV64I-NEXT: sb a6, 91(sp) -; RV64I-NEXT: sb a7, 90(sp) -; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: andi a0, t0, 31 -; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a6, a1, a0 -; RV64I-NEXT: lbu a0, 8(a6) -; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a6) -; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded 
Spill -; RV64I-NEXT: lbu a0, 10(a6) -; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a6) -; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a6) -; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a6) -; RV64I-NEXT: lbu t0, 14(a6) -; RV64I-NEXT: lbu t1, 15(a6) -; RV64I-NEXT: lbu t2, 0(a6) -; RV64I-NEXT: lbu t3, 1(a6) -; RV64I-NEXT: lbu t4, 2(a6) -; RV64I-NEXT: lbu t5, 3(a6) -; RV64I-NEXT: lbu t6, 4(a6) -; RV64I-NEXT: lbu s0, 5(a6) -; RV64I-NEXT: lbu s1, 6(a6) -; RV64I-NEXT: lbu s2, 7(a6) -; RV64I-NEXT: lbu s3, 24(a6) -; RV64I-NEXT: lbu s4, 25(a6) -; RV64I-NEXT: lbu s5, 26(a6) -; RV64I-NEXT: lbu s6, 27(a6) -; RV64I-NEXT: lbu s7, 28(a6) -; RV64I-NEXT: lbu s8, 29(a6) -; RV64I-NEXT: lbu s9, 30(a6) -; RV64I-NEXT: lbu s10, 31(a6) -; RV64I-NEXT: lbu s11, 16(a6) -; RV64I-NEXT: lbu ra, 17(a6) -; RV64I-NEXT: lbu a5, 18(a6) -; RV64I-NEXT: lbu a4, 19(a6) -; RV64I-NEXT: lbu a0, 23(a6) -; RV64I-NEXT: lbu a1, 22(a6) -; RV64I-NEXT: lbu a3, 21(a6) -; RV64I-NEXT: lbu a6, 20(a6) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a5, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) -; 
RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; RV64I-NEXT: sb a0, 9(a2) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; -; RV32I-LABEL: ashr_32bytes: +; RV32I-LABEL: shl_32bytes_dwordOff: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv t0, a1 -; RV32I-NEXT: lbu t1, 31(a0) -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: sw a1, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 1(a0) -; RV32I-NEXT: sw a1, 20(sp) 
# 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 2(a0) -; RV32I-NEXT: sw a1, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 3(a0) -; RV32I-NEXT: sw a1, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 4(a0) -; RV32I-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t2, 6(a0) -; RV32I-NEXT: lbu t3, 7(a0) -; RV32I-NEXT: lbu t4, 8(a0) -; RV32I-NEXT: lbu t5, 9(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s3, 14(a0) -; RV32I-NEXT: lbu s4, 15(a0) -; RV32I-NEXT: lbu s5, 16(a0) -; RV32I-NEXT: lbu s6, 17(a0) -; RV32I-NEXT: lbu s7, 18(a0) -; RV32I-NEXT: lbu s8, 19(a0) -; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 21(a0) -; RV32I-NEXT: lbu s11, 22(a0) -; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: lbu a7, 24(a0) -; RV32I-NEXT: lbu a6, 25(a0) -; RV32I-NEXT: lbu a5, 26(a0) -; RV32I-NEXT: lbu a4, 27(a0) -; RV32I-NEXT: lbu a1, 30(a0) -; RV32I-NEXT: lbu a3, 29(a0) -; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: lbu t0, 0(t0) -; RV32I-NEXT: sb a1, 58(sp) -; RV32I-NEXT: sb a3, 57(sp) -; RV32I-NEXT: sb a0, 56(sp) -; RV32I-NEXT: sb a4, 55(sp) -; RV32I-NEXT: sb a5, 54(sp) -; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb t1, 59(sp) +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) 
+; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: sb a7, 52(sp) -; RV32I-NEXT: sb ra, 51(sp) -; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s10, 49(sp) -; RV32I-NEXT: sb s9, 48(sp) -; RV32I-NEXT: sb s8, 47(sp) -; RV32I-NEXT: sb s7, 46(sp) -; RV32I-NEXT: sb s6, 45(sp) -; RV32I-NEXT: sb s5, 44(sp) -; RV32I-NEXT: sb s4, 43(sp) -; RV32I-NEXT: sb s3, 42(sp) -; RV32I-NEXT: sb s2, 41(sp) -; RV32I-NEXT: sb s1, 40(sp) -; RV32I-NEXT: sb s0, 39(sp) -; RV32I-NEXT: sb t6, 38(sp) -; RV32I-NEXT: sb t5, 37(sp) -; RV32I-NEXT: sb t4, 36(sp) -; RV32I-NEXT: sb t3, 35(sp) -; RV32I-NEXT: sb t2, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t1, 31 -; RV32I-NEXT: sb a0, 88(sp) -; RV32I-NEXT: sb a0, 84(sp) -; RV32I-NEXT: sb a0, 80(sp) -; RV32I-NEXT: sb a0, 76(sp) -; RV32I-NEXT: sb a0, 72(sp) -; RV32I-NEXT: sb a0, 68(sp) -; RV32I-NEXT: sb a0, 64(sp) -; RV32I-NEXT: sb a0, 60(sp) +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; 
RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw t1, 56(sp) +; RV32I-NEXT: sw t0, 52(sp) +; RV32I-NEXT: sw a7, 48(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: addi a0, sp, 32 +; RV32I-NEXT: sub a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; 
RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) ; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: sb a1, 91(sp) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 90(sp) +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 89(sp) -; RV32I-NEXT: sb a1, 87(sp) -; RV32I-NEXT: sb a3, 86(sp) -; RV32I-NEXT: sb a0, 85(sp) -; RV32I-NEXT: sb a1, 83(sp) -; RV32I-NEXT: sb a3, 82(sp) -; RV32I-NEXT: sb a0, 81(sp) -; RV32I-NEXT: sb a1, 79(sp) -; RV32I-NEXT: sb a3, 78(sp) -; RV32I-NEXT: sb a0, 77(sp) -; RV32I-NEXT: sb a1, 75(sp) -; RV32I-NEXT: sb a3, 74(sp) -; RV32I-NEXT: sb a0, 73(sp) 
-; RV32I-NEXT: sb a1, 71(sp) -; RV32I-NEXT: sb a3, 70(sp) -; RV32I-NEXT: sb a0, 69(sp) -; RV32I-NEXT: sb a1, 67(sp) -; RV32I-NEXT: sb a3, 66(sp) -; RV32I-NEXT: sb a0, 65(sp) -; RV32I-NEXT: sb a1, 63(sp) -; RV32I-NEXT: sb a3, 62(sp) -; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: andi a0, t0, 31 -; RV32I-NEXT: addi a1, sp, 28 -; RV32I-NEXT: add a6, a1, a0 -; RV32I-NEXT: lbu a0, 6(a6) -; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a6) -; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a6) -; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a6) -; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a6) -; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a6) -; RV32I-NEXT: lbu t0, 2(a6) -; RV32I-NEXT: lbu t1, 3(a6) -; RV32I-NEXT: lbu t2, 14(a6) -; RV32I-NEXT: lbu t3, 15(a6) -; RV32I-NEXT: lbu t4, 12(a6) -; RV32I-NEXT: lbu t5, 13(a6) -; RV32I-NEXT: lbu t6, 10(a6) -; RV32I-NEXT: lbu s0, 11(a6) -; RV32I-NEXT: lbu s1, 8(a6) -; RV32I-NEXT: lbu s2, 9(a6) -; RV32I-NEXT: lbu s3, 22(a6) -; RV32I-NEXT: lbu s4, 23(a6) -; RV32I-NEXT: lbu s5, 20(a6) -; RV32I-NEXT: lbu s6, 21(a6) -; RV32I-NEXT: lbu s7, 18(a6) -; RV32I-NEXT: lbu s8, 19(a6) -; RV32I-NEXT: lbu s9, 16(a6) -; RV32I-NEXT: lbu s10, 17(a6) -; RV32I-NEXT: lbu s11, 30(a6) -; RV32I-NEXT: lbu ra, 31(a6) -; RV32I-NEXT: lbu a5, 28(a6) -; RV32I-NEXT: lbu a4, 29(a6) -; RV32I-NEXT: lbu a0, 25(a6) -; RV32I-NEXT: lbu a1, 24(a6) -; RV32I-NEXT: lbu a3, 27(a6) -; RV32I-NEXT: lbu a6, 26(a6) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + 
+define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_32bytes: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, 
t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 0(a1) +; RV64I-NEXT: lbu t1, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a0, 40(sp) +; RV64I-NEXT: sd a0, 32(sp) +; RV64I-NEXT: sd a6, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: andi a0, a1, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: srl a5, a4, a1 +; RV64I-NEXT: ld a6, 16(a3) +; RV64I-NEXT: andi a0, a1, 56 +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 0(a3) +; 
RV64I-NEXT: slli a0, a6, 1 +; RV64I-NEXT: sll a0, a0, a7 +; RV64I-NEXT: or a0, a5, a0 +; RV64I-NEXT: srl t0, t0, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: slli t1, a3, 1 +; RV64I-NEXT: sll a7, t1, a7 +; RV64I-NEXT: or a7, a6, a7 +; RV64I-NEXT: sra a1, a3, a1 +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: sb t0, 0(a2) +; RV64I-NEXT: sb a5, 8(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 31(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 30(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 27(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 26(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a7, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a7, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a7, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a7, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a7, 24 +; RV64I-NEXT: sb a1, 19(a2) +; RV64I-NEXT: srli a1, a7, 16 +; RV64I-NEXT: sb a1, 18(a2) +; RV64I-NEXT: srli a1, a7, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; 
RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_32bytes: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -80 +; RV32I-NEXT: sw s0, 76(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 72(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 68(sp) # 4-byte Folded Spill +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 
20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, a0, t4 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu t3, 1(a1) +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli t3, t3, 8 +; RV32I-NEXT: or t3, t3, t4 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t5 +; RV32I-NEXT: or a1, a1, t3 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 56(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: andi a0, a1, 28 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a5, a3, a0 +; RV32I-NEXT: lw a3, 4(a5) +; RV32I-NEXT: slli a6, a1, 3 +; RV32I-NEXT: srl a4, a3, a6 +; RV32I-NEXT: lw a7, 8(a5) +; RV32I-NEXT: andi a0, a6, 24 +; RV32I-NEXT: xori t0, a0, 31 +; RV32I-NEXT: lw a1, 0(a5) +; RV32I-NEXT: slli a0, a7, 1 +; RV32I-NEXT: sll a0, a0, t0 +; RV32I-NEXT: 
or a0, a4, a0 +; RV32I-NEXT: srl t1, a1, a6 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw t2, 12(a5) +; RV32I-NEXT: lw t3, 16(a5) +; RV32I-NEXT: sll a1, a3, t0 +; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: srl t4, t2, a6 +; RV32I-NEXT: slli a3, t3, 1 +; RV32I-NEXT: sll a3, a3, t0 +; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: srl a7, a7, a6 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: lw t5, 20(a5) +; RV32I-NEXT: lw t6, 24(a5) +; RV32I-NEXT: sll t2, t2, t0 +; RV32I-NEXT: or t2, a7, t2 +; RV32I-NEXT: srl s0, t5, a6 +; RV32I-NEXT: slli s1, t6, 1 +; RV32I-NEXT: sll s1, s1, t0 +; RV32I-NEXT: or s1, s0, s1 +; RV32I-NEXT: srl t3, t3, a6 +; RV32I-NEXT: slli t5, t5, 1 +; RV32I-NEXT: lw a5, 28(a5) +; RV32I-NEXT: sll t5, t5, t0 +; RV32I-NEXT: or t5, t3, t5 +; RV32I-NEXT: srl t6, t6, a6 +; RV32I-NEXT: slli s2, a5, 1 +; RV32I-NEXT: sll t0, s2, t0 +; RV32I-NEXT: or t0, t6, t0 +; RV32I-NEXT: sra a5, a5, a6 +; RV32I-NEXT: sb t6, 24(a2) ; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) +; RV32I-NEXT: sb t3, 16(a2) +; RV32I-NEXT: sb s0, 20(a2) +; RV32I-NEXT: sb a7, 8(a2) ; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: sb t1, 0(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a4, a5, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, a5, 16 +; RV32I-NEXT: sb a4, 30(a2) +; RV32I-NEXT: srli a5, a5, 8 +; 
RV32I-NEXT: sb a5, 29(a2) +; RV32I-NEXT: srli a4, t0, 24 +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t5, 24 +; RV32I-NEXT: sb a4, 19(a2) +; RV32I-NEXT: srli a4, t5, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t5, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, s1, 24 +; RV32I-NEXT: sb a4, 23(a2) +; RV32I-NEXT: srli a4, s1, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli s1, s1, 8 +; RV32I-NEXT: sb s1, 21(a2) +; RV32I-NEXT: srli a4, t2, 24 +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a4, t2, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a4, t2, 8 +; RV32I-NEXT: sb a4, 9(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; 
RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; RV32I-NEXT: lw s0, 76(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 72(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 68(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 80 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -2614,3 +4740,744 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { store i256 %res, ptr %dst, align 1 ret void } + +define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_32bytes_wordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; 
RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 0(a1) +; RV64I-NEXT: lbu t1, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 +; 
RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a1, a1, t0 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a0, 40(sp) +; RV64I-NEXT: sd a0, 32(sp) +; RV64I-NEXT: sd a6, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: slli a0, a1, 2 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: slli a5, a1, 5 +; RV64I-NEXT: srl a1, a4, a5 +; RV64I-NEXT: ld a6, 16(a3) +; RV64I-NEXT: andi a0, a5, 32 +; RV64I-NEXT: xori a7, a0, 63 +; RV64I-NEXT: ld t0, 0(a3) +; RV64I-NEXT: slli a0, a6, 1 +; RV64I-NEXT: sll a0, a0, a7 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: srl t0, t0, a5 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: srl a6, a6, a5 +; RV64I-NEXT: slli t1, a3, 1 +; RV64I-NEXT: sll a7, t1, a7 +; RV64I-NEXT: or a7, a6, a7 +; RV64I-NEXT: sra a3, a3, a5 +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb t0, 0(a2) +; RV64I-NEXT: sb a1, 8(a2) +; RV64I-NEXT: srli a5, a6, 24 +; RV64I-NEXT: sb a5, 19(a2) +; RV64I-NEXT: srli a5, a6, 16 +; RV64I-NEXT: sb a5, 18(a2) +; RV64I-NEXT: srli a5, a6, 8 +; RV64I-NEXT: sb a5, 17(a2) +; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: sb a5, 31(a2) +; RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: sb a5, 30(a2) +; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: sb a5, 29(a2) +; RV64I-NEXT: srli a5, a3, 32 +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: srli a5, a3, 24 +; RV64I-NEXT: sb a5, 27(a2) +; RV64I-NEXT: srli a5, a3, 16 +; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a3, t0, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, t0, 16 +; RV64I-NEXT: sb a3, 2(a2) +; RV64I-NEXT: srli a3, t0, 8 +; RV64I-NEXT: sb a3, 
1(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 10(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: srli a1, a7, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a7, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a7, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a7, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a4, a4, 32 +; RV64I-NEXT: sb a4, 4(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a0, a0, 32 +; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_32bytes_wordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; 
RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, a0, t4 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 56(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: 
andi a1, a1, 28 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: 
ret + %src = load i256, ptr %src.ptr, align 1 + %wordOff = load i256, ptr %wordOff.ptr, align 1 + %bitOff = shl i256 %wordOff, 5 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; RV64I-LABEL: ashr_32bytes_dwordOff: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: lbu a3, 1(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 
+; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 32 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a1, 0(a1) +; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a0, 40(sp) +; RV64I-NEXT: sd a0, 32(sp) +; RV64I-NEXT: sd a6, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: slli a1, a1, 3 +; RV64I-NEXT: andi a1, a1, 24 +; RV64I-NEXT: mv a0, sp +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ld a1, 16(a0) +; RV64I-NEXT: ld a3, 24(a0) +; RV64I-NEXT: ld a4, 0(a0) +; RV64I-NEXT: ld a0, 8(a0) +; RV64I-NEXT: sb a1, 16(a2) +; RV64I-NEXT: sb a3, 24(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: sb a5, 23(a2) +; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: sb a5, 22(a2) +; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: sb a5, 21(a2) +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: sb a5, 20(a2) +; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: sb a5, 19(a2) +; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: 
sb a5, 18(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a3, 56 +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a3, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a3, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a3, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a3, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a3, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 25(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a4, a4, 8 +; RV64I-NEXT: sb a4, 1(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV32I-LABEL: ashr_32bytes_dwordOff: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -64 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: slli a5, a5, 16 +; RV32I-NEXT: slli a6, a6, 24 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; 
RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: lbu t0, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or t0, t0, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: lbu t1, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli t1, t1, 8 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: lbu t2, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, a0, t4 +; RV32I-NEXT: or t2, t3, t2 +; RV32I-NEXT: lbu a1, 0(a1) +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 56(sp) +; RV32I-NEXT: sw a0, 52(sp) +; 
RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw t2, 28(sp) +; RV32I-NEXT: sw t1, 24(sp) +; RV32I-NEXT: sw t0, 20(sp) +; RV32I-NEXT: sw a7, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: andi a1, a1, 24 +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: add a3, a0, a1 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a1, 0(a3) +; RV32I-NEXT: lw a4, 12(a3) +; RV32I-NEXT: lw a5, 8(a3) +; RV32I-NEXT: lw a6, 24(a3) +; RV32I-NEXT: lw a7, 28(a3) +; RV32I-NEXT: lw t0, 16(a3) +; RV32I-NEXT: lw a3, 20(a3) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb a7, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb a3, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a4, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli t1, a6, 24 +; RV32I-NEXT: sb t1, 27(a2) +; RV32I-NEXT: srli t1, a6, 16 +; RV32I-NEXT: sb t1, 26(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 25(a2) +; RV32I-NEXT: srli a6, a7, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a7, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: srli a6, a7, 8 +; RV32I-NEXT: sb a6, 29(a2) +; RV32I-NEXT: srli a6, t0, 24 +; RV32I-NEXT: sb a6, 19(a2) +; RV32I-NEXT: srli a6, t0, 16 +; RV32I-NEXT: sb a6, 18(a2) +; RV32I-NEXT: srli a6, t0, 8 +; RV32I-NEXT: sb a6, 17(a2) +; RV32I-NEXT: srli a6, a3, 24 +; RV32I-NEXT: sb a6, 23(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 22(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 21(a2) +; RV32I-NEXT: srli a3, a5, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, a5, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: sb a3, 14(a2) +; RV32I-NEXT: srli a4, a4, 8 +; 
RV32I-NEXT: sb a4, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: ret + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 6 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index a601256bc2af..7e879b137b4f 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -704,164 +704,117 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: lshr_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 1(a1) -; RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; 
RV32I-NEXT: lbu s1, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: or a1, a1, s0 -; RV32I-NEXT: sb zero, 43(sp) -; RV32I-NEXT: sb zero, 42(sp) -; RV32I-NEXT: sb zero, 41(sp) -; RV32I-NEXT: sb zero, 40(sp) -; RV32I-NEXT: sb zero, 39(sp) -; RV32I-NEXT: sb zero, 38(sp) -; RV32I-NEXT: sb zero, 37(sp) -; RV32I-NEXT: sb zero, 36(sp) -; RV32I-NEXT: sb zero, 35(sp) -; RV32I-NEXT: sb zero, 34(sp) -; RV32I-NEXT: sb zero, 33(sp) -; RV32I-NEXT: sb zero, 32(sp) -; RV32I-NEXT: sb zero, 31(sp) -; RV32I-NEXT: sb zero, 30(sp) -; RV32I-NEXT: sb zero, 29(sp) -; RV32I-NEXT: sb zero, 28(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb s4, 26(sp) -; RV32I-NEXT: sb s3, 25(sp) -; RV32I-NEXT: sb s2, 24(sp) -; RV32I-NEXT: sb t6, 23(sp) -; RV32I-NEXT: sb t5, 22(sp) -; RV32I-NEXT: sb t4, 21(sp) -; RV32I-NEXT: sb t3, 20(sp) -; RV32I-NEXT: sb t2, 19(sp) -; RV32I-NEXT: sb t1, 18(sp) -; RV32I-NEXT: sb t0, 17(sp) -; RV32I-NEXT: sb a7, 16(sp) -; RV32I-NEXT: sb a6, 15(sp) -; RV32I-NEXT: sb a5, 14(sp) -; RV32I-NEXT: sb a4, 13(sp) -; RV32I-NEXT: sb a3, 12(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 12 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; RV32I-NEXT: lbu t0, 11(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; 
RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 -; RV32I-NEXT: sll a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) -; RV32I-NEXT: slli a7, a7, 8 -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: lbu t1, 13(a3) -; RV32I-NEXT: lbu t2, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a4 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 
2(a2) -; RV32I-NEXT: srli a3, a7, 8 -; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw a0, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: srl a4, a3, a1 +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: andi a6, a1, 31 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 0(a0) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a4, a4, t0 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sll a3, a3, a6 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli a7, a0, 1 +; RV32I-NEXT: sll a6, a7, a6 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a3, 0(a2) +; RV32I-NEXT: 
sb a4, 4(a2) +; RV32I-NEXT: srli a0, a5, 16 +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: srli a0, a5, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a0, 3(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: srli a0, a4, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, a4, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %bitOff = load i128, ptr %bitOff.ptr, align 1 @@ -987,164 +940,117 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: shl_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: lbu a4, 1(a0) +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) ; RV32I-NEXT: lbu a5, 2(a0) ; RV32I-NEXT: lbu a6, 3(a0) -; RV32I-NEXT: lbu a7, 4(a0) -; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 1(a1) -; 
RV32I-NEXT: lbu s1, 0(a1) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: lbu s1, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: or a1, a1, s0 -; RV32I-NEXT: sb zero, 27(sp) -; RV32I-NEXT: sb zero, 26(sp) -; RV32I-NEXT: sb zero, 25(sp) -; RV32I-NEXT: sb zero, 24(sp) -; RV32I-NEXT: sb zero, 23(sp) -; RV32I-NEXT: sb zero, 22(sp) -; RV32I-NEXT: sb zero, 21(sp) -; RV32I-NEXT: sb zero, 20(sp) -; RV32I-NEXT: sb zero, 19(sp) -; RV32I-NEXT: sb zero, 18(sp) -; RV32I-NEXT: sb zero, 17(sp) -; RV32I-NEXT: sb zero, 16(sp) -; RV32I-NEXT: sb zero, 15(sp) -; RV32I-NEXT: sb zero, 14(sp) -; RV32I-NEXT: sb zero, 13(sp) -; RV32I-NEXT: sb zero, 12(sp) -; RV32I-NEXT: sb a0, 43(sp) -; RV32I-NEXT: sb s4, 42(sp) -; RV32I-NEXT: sb s3, 41(sp) -; RV32I-NEXT: sb s2, 40(sp) -; RV32I-NEXT: sb t6, 39(sp) -; RV32I-NEXT: sb t5, 38(sp) -; RV32I-NEXT: sb t4, 37(sp) -; RV32I-NEXT: sb t3, 36(sp) -; RV32I-NEXT: sb t2, 35(sp) -; RV32I-NEXT: sb t1, 34(sp) -; RV32I-NEXT: sb t0, 33(sp) -; RV32I-NEXT: sb a7, 32(sp) -; RV32I-NEXT: sb a6, 31(sp) -; RV32I-NEXT: sb a5, 30(sp) -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a3, 28(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 28 -; RV32I-NEXT: sub a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: sll a0, a5, a4 -; RV32I-NEXT: lbu a1, 1(a3) -; RV32I-NEXT: lbu a6, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t0, 3(a3) -; 
RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: srli a1, a6, 1 -; RV32I-NEXT: xori a7, a4, 31 -; RV32I-NEXT: srl a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu t0, 13(a3) -; RV32I-NEXT: lbu t1, 12(a3) -; RV32I-NEXT: lbu t2, 14(a3) -; RV32I-NEXT: lbu t3, 15(a3) -; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: or t0, t0, t1 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t1, t3, t2 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a4 -; RV32I-NEXT: lbu t1, 9(a3) -; RV32I-NEXT: lbu t2, 8(a3) -; RV32I-NEXT: lbu t3, 10(a3) -; RV32I-NEXT: lbu a3, 11(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: srli t1, a3, 1 -; RV32I-NEXT: srl a7, t1, a7 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: sll a3, a3, a4 -; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: not t1, a4 -; RV32I-NEXT: srl a5, a5, t1 -; RV32I-NEXT: or a5, a3, a5 -; RV32I-NEXT: sll a4, a6, a4 -; RV32I-NEXT: sb a4, 0(a2) -; RV32I-NEXT: srli a6, a3, 16 -; RV32I-NEXT: sb a6, 10(a2) -; RV32I-NEXT: srli a6, a3, 24 -; RV32I-NEXT: sb a6, 11(a2) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 9(a2) -; RV32I-NEXT: srli a3, t0, 16 -; RV32I-NEXT: sb a3, 14(a2) -; RV32I-NEXT: srli a3, t0, 24 -; RV32I-NEXT: sb 
a3, 15(a2) -; RV32I-NEXT: srli a3, t0, 8 -; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a4, 16 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: or a0, a0, a6 +; RV32I-NEXT: lbu a6, 1(a1) +; RV32I-NEXT: lbu a7, 0(a1) +; RV32I-NEXT: lbu t0, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t0 +; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a5, 24(sp) +; RV32I-NEXT: sw a4, 20(sp) +; RV32I-NEXT: sw a3, 16(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: addi a3, sp, 16 +; RV32I-NEXT: sub a3, a3, a0 +; RV32I-NEXT: lw a0, 4(a3) +; RV32I-NEXT: lw a4, 0(a3) +; RV32I-NEXT: sll a5, a0, a1 +; RV32I-NEXT: andi a6, a1, 31 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: srli a7, a4, 1 +; RV32I-NEXT: lw t0, 12(a3) +; RV32I-NEXT: lw a3, 8(a3) +; RV32I-NEXT: srl a7, a7, a6 +; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: sll a7, t0, a1 +; RV32I-NEXT: srli t0, a3, 1 +; RV32I-NEXT: srl t0, t0, a6 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: sll a3, a3, a1 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: srl a0, a0, a6 +; RV32I-NEXT: or a0, a3, a0 +; RV32I-NEXT: sll a1, a4, a1 +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: srli a3, a1, 16 ; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a4, 24 +; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) -; RV32I-NEXT: srli a3, a0, 24 -; RV32I-NEXT: sb a3, 7(a2) -; 
RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: sb a0, 8(a2) ; RV32I-NEXT: sb a7, 12(a2) -; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: sb a5, 4(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 10(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 11(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 9(a2) +; RV32I-NEXT: srli a0, a7, 16 +; RV32I-NEXT: sb a0, 14(a2) +; RV32I-NEXT: srli a0, a7, 24 +; RV32I-NEXT: sb a0, 15(a2) +; RV32I-NEXT: srli a0, a7, 8 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: srli a0, a5, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, a5, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %bitOff = load i128, ptr %bitOff.ptr, align 1 @@ -1270,171 +1176,118 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; RV32I-LABEL: ashr_16bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -64 -; RV32I-NEXT: sw s0, 60(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 56(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 52(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 48(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 15(a0) -; RV32I-NEXT: slli a4, a3, 24 -; RV32I-NEXT: lbu a5, 0(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 2(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 4(a0) -; RV32I-NEXT: lbu t2, 5(a0) -; RV32I-NEXT: lbu t3, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; 
RV32I-NEXT: lbu t5, 8(a0) -; RV32I-NEXT: lbu t6, 9(a0) -; RV32I-NEXT: lbu s0, 10(a0) -; RV32I-NEXT: lbu s1, 1(a1) -; RV32I-NEXT: lbu s2, 0(a1) -; RV32I-NEXT: lbu s3, 11(a0) -; RV32I-NEXT: lbu s4, 12(a0) -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or s1, s1, s2 -; RV32I-NEXT: lbu s2, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: lbu s5, 13(a0) -; RV32I-NEXT: lbu a0, 14(a0) -; RV32I-NEXT: slli s2, s2, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, s2 -; RV32I-NEXT: or a1, a1, s1 -; RV32I-NEXT: sb a3, 23(sp) -; RV32I-NEXT: sb a0, 22(sp) -; RV32I-NEXT: sb s5, 21(sp) -; RV32I-NEXT: sb s4, 20(sp) -; RV32I-NEXT: sb s3, 19(sp) -; RV32I-NEXT: sb s0, 18(sp) -; RV32I-NEXT: sb t6, 17(sp) -; RV32I-NEXT: sb t5, 16(sp) -; RV32I-NEXT: sb t4, 15(sp) -; RV32I-NEXT: sb t3, 14(sp) -; RV32I-NEXT: sb t2, 13(sp) -; RV32I-NEXT: sb t1, 12(sp) -; RV32I-NEXT: sb t0, 11(sp) -; RV32I-NEXT: sb a7, 10(sp) -; RV32I-NEXT: sb a6, 9(sp) -; RV32I-NEXT: sb a5, 8(sp) -; RV32I-NEXT: srai a4, a4, 31 -; RV32I-NEXT: sb a4, 36(sp) -; RV32I-NEXT: sb a4, 32(sp) -; RV32I-NEXT: sb a4, 28(sp) -; RV32I-NEXT: sb a4, 24(sp) -; RV32I-NEXT: srli a0, a4, 24 -; RV32I-NEXT: sb a0, 39(sp) -; RV32I-NEXT: srli a3, a4, 16 -; RV32I-NEXT: sb a3, 38(sp) -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 37(sp) -; RV32I-NEXT: sb a0, 35(sp) -; RV32I-NEXT: sb a3, 34(sp) -; RV32I-NEXT: sb a4, 33(sp) -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: sb a3, 30(sp) -; RV32I-NEXT: sb a4, 29(sp) -; RV32I-NEXT: sb a0, 27(sp) -; RV32I-NEXT: sb a3, 26(sp) -; RV32I-NEXT: sb a4, 25(sp) -; RV32I-NEXT: slli a0, a1, 25 -; RV32I-NEXT: srli a0, a0, 28 -; RV32I-NEXT: addi a3, sp, 8 -; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; 
RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a5, a4, a0 -; RV32I-NEXT: andi a4, a1, 7 -; RV32I-NEXT: srl a0, a5, a4 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; RV32I-NEXT: lbu t0, 11(a3) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 ; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a4 -; RV32I-NEXT: sll a1, a1, a7 -; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu a0, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a7, a0, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t0, 0(a1) +; RV32I-NEXT: lbu t1, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a4 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: xori t0, a4, 31 -; RV32I-NEXT: sll a5, a5, t0 
-; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: lbu t1, 13(a3) -; RV32I-NEXT: lbu t2, 12(a3) -; RV32I-NEXT: lbu t3, 14(a3) -; RV32I-NEXT: lbu a3, 15(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a4 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a4, a6, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: srli a4, a6, 8 -; RV32I-NEXT: sb a4, 9(a2) -; RV32I-NEXT: srli a4, a3, 16 -; RV32I-NEXT: sb a4, 14(a2) -; RV32I-NEXT: srli a4, a3, 24 -; RV32I-NEXT: sb a4, 15(a2) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a7, 8 -; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t1 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw a0, 24(sp) +; RV32I-NEXT: sw a0, 20(sp) +; RV32I-NEXT: sw a0, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: srli a0, a1, 3 +; RV32I-NEXT: andi a0, a0, 12 +; RV32I-NEXT: mv a3, sp +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: lw a3, 4(a0) +; RV32I-NEXT: srl a4, a3, a1 +; RV32I-NEXT: lw a5, 8(a0) +; RV32I-NEXT: andi a6, a1, 31 +; RV32I-NEXT: xori a6, a6, 31 +; RV32I-NEXT: lw a7, 0(a0) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a4, a4, t0 +; RV32I-NEXT: srl a7, a7, a1 +; RV32I-NEXT: slli a3, a3, 1 +; RV32I-NEXT: lw a0, 12(a0) +; RV32I-NEXT: sll a3, a3, a6 +; RV32I-NEXT: or a3, a7, a3 +; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: slli a7, a0, 
1 +; RV32I-NEXT: sll a6, a7, a6 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 14(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 15(a2) ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 13(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a3, 0(a2) +; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: srli a0, a5, 16 +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: srli a0, a5, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 3(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 48(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 40(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 64 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: sb a0, 2(a2) +; RV32I-NEXT: srli a0, a3, 24 +; RV32I-NEXT: sb a0, 3(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: srli a0, a4, 16 +; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: srli a0, a4, 24 +; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 5(a2) +; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret %src = load i128, ptr %src.ptr, align 1 %bitOff = load i128, ptr %bitOff.ptr, align 1 @@ -1446,191 +1299,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: 
sd s2, 192(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: slli 
ra, ra, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) -; RV64I-NEXT: lbu a7, 25(a0) -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu a5, 27(a0) -; RV64I-NEXT: lbu a1, 31(a0) -; RV64I-NEXT: lbu a3, 30(a0) -; RV64I-NEXT: lbu a4, 29(a0) -; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: sb a1, 87(sp) -; RV64I-NEXT: sb a3, 86(sp) -; RV64I-NEXT: sb a4, 85(sp) -; RV64I-NEXT: sb a0, 84(sp) -; RV64I-NEXT: sb a5, 83(sp) -; RV64I-NEXT: sb a6, 82(sp) -; RV64I-NEXT: sb a7, 81(sp) -; RV64I-NEXT: sb s11, 80(sp) -; RV64I-NEXT: sb s10, 79(sp) -; RV64I-NEXT: sb ra, 78(sp) -; RV64I-NEXT: sb s9, 77(sp) -; RV64I-NEXT: sb s8, 76(sp) -; RV64I-NEXT: sb s7, 75(sp) -; RV64I-NEXT: sb s6, 74(sp) -; RV64I-NEXT: sb s5, 73(sp) -; RV64I-NEXT: sb s4, 72(sp) -; RV64I-NEXT: sb s3, 71(sp) -; RV64I-NEXT: sb s2, 70(sp) -; RV64I-NEXT: sb s1, 69(sp) -; RV64I-NEXT: sb s0, 68(sp) -; RV64I-NEXT: sb t6, 67(sp) -; RV64I-NEXT: sb t5, 66(sp) -; RV64I-NEXT: sb t4, 65(sp) -; RV64I-NEXT: sb zero, 119(sp) -; RV64I-NEXT: sb zero, 118(sp) -; RV64I-NEXT: sb zero, 117(sp) -; RV64I-NEXT: sb zero, 116(sp) -; RV64I-NEXT: sb zero, 115(sp) -; RV64I-NEXT: sb zero, 114(sp) -; RV64I-NEXT: sb zero, 113(sp) -; RV64I-NEXT: sb zero, 112(sp) -; RV64I-NEXT: sb zero, 111(sp) -; RV64I-NEXT: sb zero, 110(sp) -; RV64I-NEXT: sb zero, 109(sp) -; RV64I-NEXT: sb zero, 108(sp) -; RV64I-NEXT: sb zero, 107(sp) -; RV64I-NEXT: sb zero, 106(sp) -; RV64I-NEXT: sb zero, 105(sp) -; RV64I-NEXT: sb zero, 104(sp) -; RV64I-NEXT: sb zero, 103(sp) -; RV64I-NEXT: sb zero, 102(sp) -; RV64I-NEXT: sb zero, 101(sp) -; RV64I-NEXT: sb zero, 100(sp) -; RV64I-NEXT: sb zero, 99(sp) -; RV64I-NEXT: sb zero, 98(sp) -; RV64I-NEXT: sb zero, 97(sp) -; RV64I-NEXT: sb zero, 96(sp) -; RV64I-NEXT: sb zero, 95(sp) -; RV64I-NEXT: sb zero, 94(sp) -; RV64I-NEXT: sb zero, 93(sp) -; 
RV64I-NEXT: sb zero, 92(sp) -; RV64I-NEXT: sb zero, 91(sp) -; RV64I-NEXT: sb zero, 90(sp) -; RV64I-NEXT: sb zero, 89(sp) -; RV64I-NEXT: sb zero, 88(sp) -; RV64I-NEXT: sb t3, 64(sp) -; RV64I-NEXT: sb t2, 63(sp) -; RV64I-NEXT: sb t1, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: slli a0, t0, 56 -; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 -; RV64I-NEXT: andi a1, t0, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) 
+; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -1638,92 +1343,138 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; 
RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a3) -; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) -; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: or t0, t0, t1 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 -; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 -; RV64I-NEXT: xori t0, a1, 63 -; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli 
a3, a3, 32 -; RV64I-NEXT: or a7, a3, a7 -; RV64I-NEXT: slli a3, a7, 1 -; RV64I-NEXT: sll t0, a3, t0 -; RV64I-NEXT: srl a3, a4, a1 -; RV64I-NEXT: srl a4, a6, a1 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 56(sp) +; RV64I-NEXT: sd zero, 48(sp) +; RV64I-NEXT: sd zero, 40(sp) +; RV64I-NEXT: sd zero, 32(sp) +; RV64I-NEXT: sd a0, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: srl a0, a4, a1 +; RV64I-NEXT: ld a5, 16(a3) +; RV64I-NEXT: andi a6, a1, 63 +; RV64I-NEXT: xori a6, a6, 63 +; RV64I-NEXT: ld a7, 0(a3) +; RV64I-NEXT: slli t0, a5, 1 +; RV64I-NEXT: sll t0, t0, a6 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: srl a7, a7, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a6 +; RV64I-NEXT: or a4, a7, a4 ; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: srl a1, a7, a1 -; RV64I-NEXT: srli a6, a5, 48 -; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a5, 40 -; RV64I-NEXT: sb a6, 21(a2) -; RV64I-NEXT: srli a6, a5, 32 -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: sb a6, 19(a2) -; RV64I-NEXT: srli a6, a5, 16 -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: or a6, a5, t0 -; RV64I-NEXT: sb a5, 16(a2) -; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 26(a2) +; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: sll a6, a7, a6 +; RV64I-NEXT: or a5, a5, 
a6 +; RV64I-NEXT: srl a1, a3, a1 ; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 31(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 30(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 27(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 26(a2) ; RV64I-NEXT: srli a1, a1, 8 ; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a1, a5, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a5, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a5, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a5, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a5, 24 +; RV64I-NEXT: sb a1, 19(a2) +; RV64I-NEXT: srli a1, a5, 16 +; RV64I-NEXT: sb a1, 18(a2) +; RV64I-NEXT: srli a5, a5, 8 +; RV64I-NEXT: sb a5, 17(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) ; RV64I-NEXT: srli a1, a4, 48 ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a4, 40 @@ -1734,366 +1485,234 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 3(a2) ; RV64I-NEXT: srli a1, a4, 16 ; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a4, t1 -; RV64I-NEXT: sb a4, 0(a2) ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: srli a4, a3, 48 -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: srli a4, a3, 40 -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: srli a4, a3, 32 -; RV64I-NEXT: sb a4, 12(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: srli a4, a3, 16 -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: sb a3, 8(a2) -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 9(a2) -; RV64I-NEXT: srli a3, a6, 56 -; RV64I-NEXT: sb a3, 23(a2) -; RV64I-NEXT: srli a1, a1, 56 -; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; 
RV64I-NEXT: sb a0, 15(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 
0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -64 ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 -; RV32I-NEXT: lbu s11, 22(a0) -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, ra -; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) -; RV32I-NEXT: lbu a7, 25(a0) -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu a5, 27(a0) -; RV32I-NEXT: lbu a1, 31(a0) -; RV32I-NEXT: lbu a3, 30(a0) -; RV32I-NEXT: lbu a4, 29(a0) -; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: sb a1, 59(sp) -; RV32I-NEXT: sb a3, 58(sp) -; RV32I-NEXT: sb a4, 57(sp) -; RV32I-NEXT: sb a0, 56(sp) -; RV32I-NEXT: sb a5, 55(sp) -; RV32I-NEXT: sb a6, 54(sp) -; RV32I-NEXT: sb a7, 53(sp) -; RV32I-NEXT: sb s10, 52(sp) -; RV32I-NEXT: sb ra, 51(sp) -; RV32I-NEXT: sb s11, 50(sp) -; RV32I-NEXT: sb s9, 49(sp) -; RV32I-NEXT: sb s8, 48(sp) -; RV32I-NEXT: sb s7, 47(sp) -; RV32I-NEXT: sb s6, 46(sp) -; RV32I-NEXT: sb s5, 
45(sp) -; RV32I-NEXT: sb s4, 44(sp) -; RV32I-NEXT: sb zero, 91(sp) -; RV32I-NEXT: sb zero, 90(sp) -; RV32I-NEXT: sb zero, 89(sp) -; RV32I-NEXT: sb zero, 88(sp) -; RV32I-NEXT: sb zero, 87(sp) -; RV32I-NEXT: sb zero, 86(sp) -; RV32I-NEXT: sb zero, 85(sp) -; RV32I-NEXT: sb zero, 84(sp) -; RV32I-NEXT: sb zero, 83(sp) -; RV32I-NEXT: sb zero, 82(sp) -; RV32I-NEXT: sb zero, 81(sp) -; RV32I-NEXT: sb zero, 80(sp) -; RV32I-NEXT: sb zero, 79(sp) -; RV32I-NEXT: sb zero, 78(sp) -; RV32I-NEXT: sb zero, 77(sp) -; RV32I-NEXT: sb zero, 76(sp) -; RV32I-NEXT: sb zero, 75(sp) -; RV32I-NEXT: sb zero, 74(sp) -; RV32I-NEXT: sb zero, 73(sp) -; RV32I-NEXT: sb zero, 72(sp) -; RV32I-NEXT: sb zero, 71(sp) -; RV32I-NEXT: sb zero, 70(sp) -; RV32I-NEXT: sb zero, 69(sp) -; RV32I-NEXT: sb zero, 68(sp) -; RV32I-NEXT: sb zero, 67(sp) -; RV32I-NEXT: sb zero, 66(sp) -; RV32I-NEXT: sb zero, 65(sp) -; RV32I-NEXT: sb zero, 64(sp) -; RV32I-NEXT: sb zero, 63(sp) -; RV32I-NEXT: sb zero, 62(sp) -; RV32I-NEXT: sb zero, 61(sp) -; RV32I-NEXT: sb zero, 60(sp) -; RV32I-NEXT: sb s3, 43(sp) -; RV32I-NEXT: sb s2, 42(sp) -; RV32I-NEXT: sb s1, 41(sp) -; RV32I-NEXT: sb s0, 40(sp) -; RV32I-NEXT: sb t6, 39(sp) -; RV32I-NEXT: sb t5, 38(sp) -; RV32I-NEXT: sb t4, 37(sp) -; RV32I-NEXT: sb t3, 36(sp) -; RV32I-NEXT: sb t2, 35(sp) -; RV32I-NEXT: sb t1, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: slli a0, t0, 24 -; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 28 -; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: 
lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a3, t0, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: lbu a6, 11(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a6, a5 -; RV32I-NEXT: or a6, a1, a0 -; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t1, a3 -; RV32I-NEXT: sll a0, a0, t1 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t0, 3(a4) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or t0, a5, a1 -; RV32I-NEXT: slli a1, t5, 1 -; RV32I-NEXT: xori t2, a3, 31 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) -; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t4, 15(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; 
RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: lbu a7, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t2 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t4, 18(a4) -; RV32I-NEXT: lbu t6, 19(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t2, t2, a7 +; RV32I-NEXT: lbu a7, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t4 -; RV32I-NEXT: or t4, a7, a5 -; RV32I-NEXT: slli a5, t4, 1 -; RV32I-NEXT: sll a7, a5, t1 -; RV32I-NEXT: lbu a5, 21(a4) -; RV32I-NEXT: lbu t6, 20(a4) -; RV32I-NEXT: lbu s0, 22(a4) -; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; 
RV32I-NEXT: or s0, s0, a5 -; RV32I-NEXT: lbu a5, 25(a4) -; RV32I-NEXT: lbu t6, 24(a4) -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: or t6, t6, a5 -; RV32I-NEXT: lbu a5, 29(a4) -; RV32I-NEXT: lbu s1, 28(a4) -; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t1, s2, t1 -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu a4, 31(a4) -; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, s0, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s3, a4, a5 -; RV32I-NEXT: slli a4, s3, 1 -; RV32I-NEXT: sll t2, a4, t2 -; RV32I-NEXT: srl a4, t5, a3 -; RV32I-NEXT: srl a5, t0, a3 -; RV32I-NEXT: srl t0, t3, a3 -; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t3, s0, a3 -; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t5, t6, a3 -; RV32I-NEXT: srl a3, s3, a3 -; RV32I-NEXT: srli t6, t5, 16 -; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: sb t5, 25(a2) -; RV32I-NEXT: srli t5, a3, 24 -; RV32I-NEXT: sb t5, 31(a2) -; RV32I-NEXT: srli t5, a3, 16 -; RV32I-NEXT: sb t5, 30(a2) -; RV32I-NEXT: sb a3, 28(a2) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, t4, 16 -; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or a3, t4, s1 -; RV32I-NEXT: sb t4, 16(a2) -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: srli t4, t3, 16 -; RV32I-NEXT: sb t4, 22(a2) -; RV32I-NEXT: or t1, t3, t1 -; RV32I-NEXT: sb t3, 20(a2) -; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, a6, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, a6, s2 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: srli a6, a6, 8 -; 
RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: srli a7, t0, 8 -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a5, a4, 16 -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, a7 +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t3, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a7, a1, a7 +; RV32I-NEXT: sw zero, 60(sp) +; RV32I-NEXT: sw zero, 56(sp) +; RV32I-NEXT: sw zero, 52(sp) +; RV32I-NEXT: sw zero, 48(sp) +; RV32I-NEXT: sw zero, 44(sp) +; RV32I-NEXT: sw zero, 40(sp) +; RV32I-NEXT: sw zero, 36(sp) +; RV32I-NEXT: sw zero, 32(sp) +; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: srli a0, a7, 3 +; RV32I-NEXT: andi a0, a0, 28 +; RV32I-NEXT: mv a1, sp +; RV32I-NEXT: add a4, a1, a0 +; RV32I-NEXT: lw a1, 4(a4) +; RV32I-NEXT: srl a0, a1, a7 +; RV32I-NEXT: lw a5, 8(a4) +; RV32I-NEXT: andi a3, a7, 31 +; RV32I-NEXT: xori a6, a3, 31 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: srl a3, a3, a7 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: lw t0, 12(a4) +; RV32I-NEXT: lw t1, 16(a4) +; RV32I-NEXT: sll a1, a1, a6 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srl a3, t0, a7 +; RV32I-NEXT: slli t2, t1, 1 +; RV32I-NEXT: sll t2, t2, 
a6 +; RV32I-NEXT: or a3, a3, t2 +; RV32I-NEXT: srl a5, a5, a7 +; RV32I-NEXT: slli t0, t0, 1 +; RV32I-NEXT: lw t2, 20(a4) +; RV32I-NEXT: lw t3, 24(a4) +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a5, a5, t0 +; RV32I-NEXT: srl t0, t2, a7 +; RV32I-NEXT: slli t4, t3, 1 +; RV32I-NEXT: sll t4, t4, a6 +; RV32I-NEXT: or t0, t0, t4 +; RV32I-NEXT: srl t1, t1, a7 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: sll t2, t2, a6 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: srl t2, t3, a7 +; RV32I-NEXT: slli t3, a4, 1 +; RV32I-NEXT: sll a6, t3, a6 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: srl a4, a4, a7 +; RV32I-NEXT: sb a4, 28(a2) +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: sb a7, 31(a2) +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: sb a7, 30(a2) ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: srli a4, t2, 24 +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb t1, 16(a2) +; RV32I-NEXT: sb t0, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a4, a6, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: sb a3, 19(a2) -; RV32I-NEXT: srli a3, t1, 24 -; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a3, t3, 24 -; RV32I-NEXT: sb a3, 11(a2) -; RV32I-NEXT: srli a3, a6, 24 -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: 
lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t1, 24 +; RV32I-NEXT: sb a4, 19(a2) +; RV32I-NEXT: srli a4, t1, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t1, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, t0, 24 +; RV32I-NEXT: sb a4, 23(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 21(a2) +; RV32I-NEXT: srli a4, a5, 24 +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a4, a5, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2104,191 +1723,43 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 192(sp) # 
8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t1, 6(a0) -; RV64I-NEXT: lbu t2, 7(a0) -; RV64I-NEXT: lbu t3, 8(a0) -; RV64I-NEXT: lbu t4, 9(a0) -; RV64I-NEXT: lbu t5, 10(a0) -; RV64I-NEXT: lbu t6, 11(a0) -; RV64I-NEXT: lbu s0, 12(a0) -; RV64I-NEXT: lbu s1, 13(a0) -; RV64I-NEXT: lbu s2, 14(a0) -; RV64I-NEXT: lbu s3, 15(a0) -; RV64I-NEXT: lbu s4, 16(a0) -; RV64I-NEXT: lbu s5, 17(a0) -; RV64I-NEXT: lbu s6, 18(a0) -; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or s11, s11, s9 -; RV64I-NEXT: lbu s9, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s9 -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: slli ra, ra, 16 -; 
RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t0, a1, s11 -; RV64I-NEXT: lbu s11, 24(a0) -; RV64I-NEXT: lbu a7, 25(a0) -; RV64I-NEXT: lbu a6, 26(a0) -; RV64I-NEXT: lbu a5, 27(a0) -; RV64I-NEXT: lbu a1, 31(a0) -; RV64I-NEXT: lbu a3, 30(a0) -; RV64I-NEXT: lbu a4, 29(a0) -; RV64I-NEXT: lbu a0, 28(a0) -; RV64I-NEXT: sb a1, 119(sp) -; RV64I-NEXT: sb a3, 118(sp) -; RV64I-NEXT: sb a4, 117(sp) -; RV64I-NEXT: sb a0, 116(sp) -; RV64I-NEXT: sb a5, 115(sp) -; RV64I-NEXT: sb a6, 114(sp) -; RV64I-NEXT: sb a7, 113(sp) -; RV64I-NEXT: sb s11, 112(sp) -; RV64I-NEXT: sb s10, 111(sp) -; RV64I-NEXT: sb ra, 110(sp) -; RV64I-NEXT: sb s9, 109(sp) -; RV64I-NEXT: sb s8, 108(sp) -; RV64I-NEXT: sb s7, 107(sp) -; RV64I-NEXT: sb s6, 106(sp) -; RV64I-NEXT: sb s5, 105(sp) -; RV64I-NEXT: sb s4, 104(sp) -; RV64I-NEXT: sb s3, 103(sp) -; RV64I-NEXT: sb s2, 102(sp) -; RV64I-NEXT: sb s1, 101(sp) -; RV64I-NEXT: sb s0, 100(sp) -; RV64I-NEXT: sb t6, 99(sp) -; RV64I-NEXT: sb t5, 98(sp) -; RV64I-NEXT: sb t4, 97(sp) -; RV64I-NEXT: sb t3, 96(sp) -; RV64I-NEXT: sb zero, 87(sp) -; RV64I-NEXT: sb zero, 86(sp) -; RV64I-NEXT: sb zero, 85(sp) -; RV64I-NEXT: sb zero, 84(sp) -; RV64I-NEXT: sb zero, 83(sp) -; RV64I-NEXT: sb zero, 82(sp) -; RV64I-NEXT: sb zero, 81(sp) -; RV64I-NEXT: sb zero, 80(sp) -; RV64I-NEXT: sb zero, 79(sp) -; RV64I-NEXT: sb zero, 78(sp) -; RV64I-NEXT: sb zero, 77(sp) -; RV64I-NEXT: sb zero, 76(sp) -; RV64I-NEXT: sb zero, 75(sp) -; RV64I-NEXT: sb zero, 74(sp) -; RV64I-NEXT: sb zero, 73(sp) -; RV64I-NEXT: sb zero, 72(sp) -; RV64I-NEXT: sb zero, 71(sp) -; RV64I-NEXT: sb zero, 70(sp) -; RV64I-NEXT: sb zero, 69(sp) -; RV64I-NEXT: sb zero, 68(sp) -; RV64I-NEXT: sb zero, 67(sp) -; RV64I-NEXT: sb zero, 66(sp) -; RV64I-NEXT: sb zero, 65(sp) -; RV64I-NEXT: sb zero, 64(sp) -; RV64I-NEXT: sb zero, 63(sp) -; RV64I-NEXT: sb zero, 62(sp) -; RV64I-NEXT: sb zero, 
61(sp) -; RV64I-NEXT: sb zero, 60(sp) -; RV64I-NEXT: sb zero, 59(sp) -; RV64I-NEXT: sb zero, 58(sp) -; RV64I-NEXT: sb zero, 57(sp) -; RV64I-NEXT: sb zero, 56(sp) -; RV64I-NEXT: sb t2, 95(sp) -; RV64I-NEXT: sb t1, 94(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 93(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 92(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 91(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 90(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: slli a0, t0, 56 -; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a1, sp, 88 -; RV64I-NEXT: sub a0, a1, a0 -; RV64I-NEXT: lbu a1, 9(a0) -; RV64I-NEXT: lbu a3, 8(a0) -; RV64I-NEXT: lbu a4, 10(a0) -; RV64I-NEXT: lbu a5, 11(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: lbu a3, 13(a0) -; RV64I-NEXT: lbu a4, 12(a0) -; RV64I-NEXT: lbu a5, 14(a0) -; RV64I-NEXT: lbu a6, 15(a0) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a1 -; RV64I-NEXT: andi a1, t0, 7 -; RV64I-NEXT: lbu a4, 1(a0) -; RV64I-NEXT: lbu a5, 0(a0) -; RV64I-NEXT: lbu a6, 2(a0) -; RV64I-NEXT: lbu a7, 3(a0) +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, 
a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 5(a0) -; RV64I-NEXT: lbu a6, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu t0, 7(a0) +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -2297,20 +1768,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 25(a0) -; RV64I-NEXT: lbu a6, 24(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu t0, 27(a0) +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 29(a0) -; RV64I-NEXT: lbu a7, 28(a0) -; RV64I-NEXT: lbu t0, 30(a0) -; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 @@ -2319,439 +1790,353 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu a7, 16(a0) -; RV64I-NEXT: lbu t0, 18(a0) -; RV64I-NEXT: lbu t1, 19(a0) +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 
26(a0) +; RV64I-NEXT: lbu t1, 27(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: lbu a7, 21(a0) -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: lbu t0, 20(a0) +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t1, 22(a0) -; RV64I-NEXT: lbu a0, 23(a0) ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: srli t0, a4, 1 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or t1, a0, t1 -; RV64I-NEXT: xori t2, a1, 63 -; RV64I-NEXT: srl a0, t0, t2 -; RV64I-NEXT: or a7, t1, a7 -; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 1(a1) +; RV64I-NEXT: lbu a7, 0(a1) +; RV64I-NEXT: lbu t0, 2(a1) +; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: srli a7, a6, 1 -; RV64I-NEXT: srl a7, a7, t2 +; RV64I-NEXT: lbu a7, 5(a1) +; RV64I-NEXT: lbu t0, 4(a1) +; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a7, a7, 8 +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, t1 +; RV64I-NEXT: or a1, a1, a7 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a1, a1, a6 +; RV64I-NEXT: sd zero, 24(sp) +; RV64I-NEXT: sd zero, 16(sp) +; RV64I-NEXT: sd zero, 8(sp) +; RV64I-NEXT: sd zero, 0(sp) +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a5, 48(sp) +; RV64I-NEXT: sd a4, 40(sp) +; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: addi a3, sp, 32 +; RV64I-NEXT: sub a3, a3, a0 +; 
RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: ld a5, 0(a3) +; RV64I-NEXT: sll a0, a4, a1 +; RV64I-NEXT: andi a6, a1, 63 +; RV64I-NEXT: xori a6, a6, 63 +; RV64I-NEXT: srli a7, a5, 1 +; RV64I-NEXT: ld t0, 24(a3) +; RV64I-NEXT: ld a3, 16(a3) +; RV64I-NEXT: srl a7, a7, a6 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: sll a7, t0, a1 ; RV64I-NEXT: srli t0, a3, 1 -; RV64I-NEXT: not t1, a1 -; RV64I-NEXT: srl t0, t0, t1 +; RV64I-NEXT: srl t0, t0, a6 +; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: sll a3, a3, a1 -; RV64I-NEXT: sll a5, a5, a1 -; RV64I-NEXT: sll a6, a6, a1 -; RV64I-NEXT: sll a1, a4, a1 -; RV64I-NEXT: srli a4, a6, 56 -; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a6, 48 -; RV64I-NEXT: sb a4, 22(a2) -; RV64I-NEXT: srli a4, a6, 40 -; RV64I-NEXT: sb a4, 21(a2) -; RV64I-NEXT: srli a4, a6, 32 -; RV64I-NEXT: sb a4, 20(a2) -; RV64I-NEXT: srli a4, a6, 24 -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a4, a6, 16 -; RV64I-NEXT: sb a4, 18(a2) -; RV64I-NEXT: or a4, a6, t0 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: sb a6, 31(a2) -; RV64I-NEXT: srli a6, a5, 48 -; RV64I-NEXT: sb a6, 30(a2) -; RV64I-NEXT: srli a6, a5, 40 -; RV64I-NEXT: sb a6, 29(a2) -; RV64I-NEXT: srli a6, a5, 32 -; RV64I-NEXT: sb a6, 28(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: sb a6, 27(a2) -; RV64I-NEXT: srli a6, a5, 16 -; RV64I-NEXT: sb a6, 26(a2) -; RV64I-NEXT: or a6, a5, a7 -; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 6(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 2(a2) +; RV64I-NEXT: srli a4, a4, 1 +; RV64I-NEXT: srl a4, a4, a6 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: sll a1, a5, a1 ; RV64I-NEXT: sb a1, 0(a2) +; RV64I-NEXT: 
srli a4, a1, 56 +; RV64I-NEXT: sb a4, 7(a2) +; RV64I-NEXT: srli a4, a1, 48 +; RV64I-NEXT: sb a4, 6(a2) +; RV64I-NEXT: srli a4, a1, 40 +; RV64I-NEXT: sb a4, 5(a2) +; RV64I-NEXT: srli a4, a1, 32 +; RV64I-NEXT: sb a4, 4(a2) +; RV64I-NEXT: srli a4, a1, 24 +; RV64I-NEXT: sb a4, 3(a2) +; RV64I-NEXT: srli a4, a1, 16 +; RV64I-NEXT: sb a4, 2(a2) ; RV64I-NEXT: srli a1, a1, 8 ; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: sb a3, 16(a2) +; RV64I-NEXT: sb a7, 24(a2) +; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: srli a1, a3, 56 -; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: sb a1, 23(a2) ; RV64I-NEXT: srli a1, a3, 48 -; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: sb a1, 22(a2) ; RV64I-NEXT: srli a1, a3, 40 -; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: sb a1, 21(a2) ; RV64I-NEXT: srli a1, a3, 32 -; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: sb a1, 20(a2) ; RV64I-NEXT: srli a1, a3, 24 -; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: sb a1, 19(a2) ; RV64I-NEXT: srli a1, a3, 16 -; RV64I-NEXT: sb a1, 10(a2) -; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: sb a1, 18(a2) ; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 9(a2) -; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a6, 24(a2) -; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: srli a1, a7, 56 +; RV64I-NEXT: sb a1, 31(a2) +; 
RV64I-NEXT: srli a1, a7, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a7, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a7, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a7, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a7, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: srli a1, a7, 8 +; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -64 ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte 
Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) -; RV32I-NEXT: lbu s10, 1(a1) -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s11, 0(a1) -; RV32I-NEXT: slli s10, s10, 8 -; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or s10, s10, s11 -; RV32I-NEXT: lbu s11, 22(a0) -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, ra -; RV32I-NEXT: lbu ra, 23(a0) -; RV32I-NEXT: or t0, a1, s10 -; RV32I-NEXT: lbu s10, 24(a0) -; RV32I-NEXT: lbu a7, 25(a0) -; RV32I-NEXT: lbu a6, 26(a0) -; RV32I-NEXT: lbu a5, 27(a0) -; RV32I-NEXT: lbu a1, 31(a0) -; RV32I-NEXT: lbu a3, 30(a0) -; RV32I-NEXT: lbu a4, 29(a0) -; RV32I-NEXT: lbu a0, 28(a0) -; RV32I-NEXT: sb a1, 91(sp) -; RV32I-NEXT: sb a3, 90(sp) -; RV32I-NEXT: sb a4, 89(sp) -; RV32I-NEXT: sb a0, 88(sp) -; RV32I-NEXT: sb a5, 87(sp) -; RV32I-NEXT: sb a6, 86(sp) -; RV32I-NEXT: sb a7, 85(sp) -; RV32I-NEXT: sb s10, 84(sp) -; RV32I-NEXT: sb ra, 83(sp) -; RV32I-NEXT: sb s11, 82(sp) -; RV32I-NEXT: sb s9, 81(sp) -; RV32I-NEXT: sb s8, 80(sp) -; RV32I-NEXT: sb s7, 79(sp) -; RV32I-NEXT: sb s6, 78(sp) -; RV32I-NEXT: sb s5, 77(sp) -; RV32I-NEXT: sb s4, 76(sp) -; RV32I-NEXT: sb zero, 59(sp) -; RV32I-NEXT: sb zero, 58(sp) -; RV32I-NEXT: sb zero, 57(sp) -; RV32I-NEXT: sb zero, 56(sp) -; RV32I-NEXT: sb zero, 55(sp) -; RV32I-NEXT: sb zero, 54(sp) -; RV32I-NEXT: sb zero, 53(sp) -; RV32I-NEXT: sb zero, 52(sp) -; RV32I-NEXT: sb zero, 51(sp) -; RV32I-NEXT: sb zero, 50(sp) -; RV32I-NEXT: sb zero, 49(sp) -; RV32I-NEXT: sb 
zero, 48(sp) -; RV32I-NEXT: sb zero, 47(sp) -; RV32I-NEXT: sb zero, 46(sp) -; RV32I-NEXT: sb zero, 45(sp) -; RV32I-NEXT: sb zero, 44(sp) -; RV32I-NEXT: sb zero, 43(sp) -; RV32I-NEXT: sb zero, 42(sp) -; RV32I-NEXT: sb zero, 41(sp) -; RV32I-NEXT: sb zero, 40(sp) -; RV32I-NEXT: sb zero, 39(sp) -; RV32I-NEXT: sb zero, 38(sp) -; RV32I-NEXT: sb zero, 37(sp) -; RV32I-NEXT: sb zero, 36(sp) -; RV32I-NEXT: sb zero, 35(sp) -; RV32I-NEXT: sb zero, 34(sp) -; RV32I-NEXT: sb zero, 33(sp) -; RV32I-NEXT: sb zero, 32(sp) -; RV32I-NEXT: sb zero, 31(sp) -; RV32I-NEXT: sb zero, 30(sp) -; RV32I-NEXT: sb zero, 29(sp) -; RV32I-NEXT: sb zero, 28(sp) -; RV32I-NEXT: sb s3, 75(sp) -; RV32I-NEXT: sb s2, 74(sp) -; RV32I-NEXT: sb s1, 73(sp) -; RV32I-NEXT: sb s0, 72(sp) -; RV32I-NEXT: sb t6, 71(sp) -; RV32I-NEXT: sb t5, 70(sp) -; RV32I-NEXT: sb t4, 69(sp) -; RV32I-NEXT: sb t3, 68(sp) -; RV32I-NEXT: sb t2, 67(sp) -; RV32I-NEXT: sb t1, 66(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 65(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 64(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 63(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 62(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: slli a0, t0, 24 -; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 60 -; RV32I-NEXT: sub a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a4) -; RV32I-NEXT: lbu a3, 0(a4) -; RV32I-NEXT: lbu a5, 2(a4) -; RV32I-NEXT: lbu a6, 3(a4) -; RV32I-NEXT: slli a0, a0, 
8 -; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) +; RV32I-NEXT: slli a3, a3, 8 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a5 -; RV32I-NEXT: or a6, a3, a0 -; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: xori a7, a1, 31 -; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a4) -; RV32I-NEXT: lbu a5, 12(a4) -; RV32I-NEXT: lbu t0, 14(a4) -; RV32I-NEXT: lbu t1, 15(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 24 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, t0 -; RV32I-NEXT: or t0, a5, a3 -; RV32I-NEXT: lbu a3, 9(a4) -; RV32I-NEXT: lbu a5, 8(a4) -; RV32I-NEXT: lbu t1, 10(a4) -; RV32I-NEXT: lbu t2, 11(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; 
RV32I-NEXT: or a5, t2, t1 -; RV32I-NEXT: or t1, a5, a3 -; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a5, a3, a7 -; RV32I-NEXT: srli t4, t5, 1 -; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu a3, 21(a4) -; RV32I-NEXT: lbu t3, 20(a4) -; RV32I-NEXT: lbu t6, 22(a4) -; RV32I-NEXT: lbu s0, 23(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t3, s0, t6 -; RV32I-NEXT: or t3, t3, a3 -; RV32I-NEXT: lbu a3, 17(a4) -; RV32I-NEXT: lbu t6, 16(a4) -; RV32I-NEXT: lbu s0, 18(a4) -; RV32I-NEXT: lbu s1, 19(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a3 -; RV32I-NEXT: lbu a3, 29(a4) -; RV32I-NEXT: lbu t6, 28(a4) -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu s2, 31(a4) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, t6 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: lbu s1, 25(a4) -; RV32I-NEXT: lbu s2, 24(a4) -; RV32I-NEXT: srl t4, t4, t2 -; RV32I-NEXT: or t6, t6, a3 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or a3, s1, s2 -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu a4, 27(a4) -; RV32I-NEXT: srli s2, s0, 1 -; RV32I-NEXT: srl s2, s2, a7 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: srli s1, t0, 1 -; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a4, a4, a3 -; RV32I-NEXT: srli a3, a4, 1 -; RV32I-NEXT: srl a7, a3, a7 -; RV32I-NEXT: srli a3, t3, 1 -; RV32I-NEXT: srl t2, a3, t2 -; RV32I-NEXT: sll a3, t5, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t3, t3, a1 -; RV32I-NEXT: sll t5, s0, a1 -; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a4, a4, a1 -; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a4, 24 -; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a4, 16 -; RV32I-NEXT: sb a6, 26(a2) -; 
RV32I-NEXT: or a6, a4, t2 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: lbu a7, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t2 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t2, t2, a7 +; RV32I-NEXT: lbu a7, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or a0, a0, t4 +; RV32I-NEXT: or a0, a0, a7 +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t3, 0(a1) +; RV32I-NEXT: lbu t4, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 +; RV32I-NEXT: slli t4, t4, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t4 +; RV32I-NEXT: or a7, a1, a7 +; RV32I-NEXT: sw zero, 28(sp) +; RV32I-NEXT: sw zero, 24(sp) +; RV32I-NEXT: sw zero, 20(sp) +; RV32I-NEXT: sw zero, 16(sp) +; RV32I-NEXT: sw zero, 12(sp) +; RV32I-NEXT: sw zero, 8(sp) +; RV32I-NEXT: sw zero, 4(sp) +; RV32I-NEXT: sw zero, 0(sp) +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw t2, 56(sp) +; RV32I-NEXT: sw t1, 52(sp) +; RV32I-NEXT: sw t0, 48(sp) +; RV32I-NEXT: sw a6, 44(sp) +; RV32I-NEXT: sw a5, 40(sp) +; RV32I-NEXT: sw a4, 36(sp) +; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: srli a0, a7, 3 +; RV32I-NEXT: andi a0, a0, 28 +; RV32I-NEXT: addi a1, sp, 32 +; RV32I-NEXT: sub a4, a1, a0 +; RV32I-NEXT: lw a3, 4(a4) +; RV32I-NEXT: lw a5, 0(a4) +; RV32I-NEXT: sll a0, a3, a7 +; RV32I-NEXT: andi a1, a7, 31 +; 
RV32I-NEXT: xori a6, a1, 31 +; RV32I-NEXT: srli a1, a5, 1 +; RV32I-NEXT: lw t0, 12(a4) +; RV32I-NEXT: lw t1, 8(a4) +; RV32I-NEXT: srl a1, a1, a6 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: sll a1, t0, a7 +; RV32I-NEXT: srli t2, t1, 1 +; RV32I-NEXT: srl t2, t2, a6 +; RV32I-NEXT: or a1, a1, t2 +; RV32I-NEXT: sll t1, t1, a7 +; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: lw t2, 20(a4) +; RV32I-NEXT: lw t3, 16(a4) +; RV32I-NEXT: srl a3, a3, a6 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: sll t1, t2, a7 +; RV32I-NEXT: srli t4, t3, 1 +; RV32I-NEXT: srl t4, t4, a6 +; RV32I-NEXT: or t1, t1, t4 +; RV32I-NEXT: sll t3, t3, a7 +; RV32I-NEXT: srli t0, t0, 1 +; RV32I-NEXT: lw t4, 28(a4) +; RV32I-NEXT: lw a4, 24(a4) +; RV32I-NEXT: srl t0, t0, a6 +; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: sll t3, t4, a7 +; RV32I-NEXT: srli t4, a4, 1 +; RV32I-NEXT: srl t4, t4, a6 +; RV32I-NEXT: or t3, t3, t4 +; RV32I-NEXT: sll a4, a4, a7 +; RV32I-NEXT: srli t2, t2, 1 +; RV32I-NEXT: srl a6, t2, a6 +; RV32I-NEXT: or a4, a4, a6 +; RV32I-NEXT: sll a5, a5, a7 +; RV32I-NEXT: sb a5, 0(a2) +; RV32I-NEXT: srli a6, a5, 24 +; RV32I-NEXT: sb a6, 3(a2) +; RV32I-NEXT: srli a6, a5, 16 +; RV32I-NEXT: sb a6, 2(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) +; RV32I-NEXT: sb a4, 24(a2) +; RV32I-NEXT: sb t3, 28(a2) +; RV32I-NEXT: sb t0, 16(a2) +; RV32I-NEXT: sb t1, 20(a2) +; RV32I-NEXT: sb a3, 8(a2) +; RV32I-NEXT: sb a1, 12(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a5, a4, 24 +; RV32I-NEXT: sb a5, 27(a2) +; RV32I-NEXT: srli a5, a4, 16 +; RV32I-NEXT: sb a5, 26(a2) ; RV32I-NEXT: srli a4, a4, 8 ; RV32I-NEXT: sb a4, 25(a2) -; RV32I-NEXT: srli a4, t6, 24 +; RV32I-NEXT: srli a4, t3, 24 ; RV32I-NEXT: sb a4, 31(a2) -; RV32I-NEXT: srli a4, t6, 16 +; RV32I-NEXT: srli a4, t3, 16 ; RV32I-NEXT: sb a4, 30(a2) -; RV32I-NEXT: or a4, t6, a7 -; RV32I-NEXT: srli a7, t6, 8 -; RV32I-NEXT: sb a7, 29(a2) -; RV32I-NEXT: srli a7, t5, 24 -; RV32I-NEXT: sb a7, 19(a2) -; RV32I-NEXT: srli a7, t5, 16 -; RV32I-NEXT: sb a7, 
18(a2) -; RV32I-NEXT: or a7, t5, s1 -; RV32I-NEXT: srli t2, t5, 8 -; RV32I-NEXT: sb t2, 17(a2) -; RV32I-NEXT: srli t2, t3, 24 -; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t3, 16 -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t3, s2 -; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, t1, 24 -; RV32I-NEXT: sb t3, 11(a2) -; RV32I-NEXT: srli t3, t1, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, t1, t4 -; RV32I-NEXT: srli t1, t1, 8 -; RV32I-NEXT: sb t1, 9(a2) -; RV32I-NEXT: srli t1, t0, 24 -; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli t1, t0, 16 -; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: srli t0, t0, 8 -; RV32I-NEXT: sb t0, 13(a2) -; RV32I-NEXT: srli t0, a1, 24 -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: srli t0, a1, 16 -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: srli a4, t3, 8 +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: srli a4, t0, 24 +; RV32I-NEXT: sb a4, 19(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, t1, 24 +; RV32I-NEXT: sb a4, 23(a2) +; RV32I-NEXT: srli a4, t1, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli a4, t1, 8 +; RV32I-NEXT: sb a4, 21(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 14(a2) ; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, a3, 24 +; RV32I-NEXT: sb a1, 13(a2) +; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, a3, 16 +; RV32I-NEXT: srli a1, a0, 16 ; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 5(a2) -; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a4, 28(a2) -; 
RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb t3, 8(a2) -; RV32I-NEXT: sb a5, 12(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 @@ -2762,200 +2147,43 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -224 -; RV64I-NEXT: sd ra, 216(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 208(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 200(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 192(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 184(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 176(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 168(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 160(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 120(sp) # 8-byte Folded Spill -; 
RV64I-NEXT: lbu t1, 31(a0) -; RV64I-NEXT: lbu a3, 0(a0) -; RV64I-NEXT: sd a3, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -64 ; RV64I-NEXT: lbu a3, 1(a0) -; RV64I-NEXT: sd a3, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 2(a0) -; RV64I-NEXT: sd a3, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 3(a0) -; RV64I-NEXT: sd a3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: sd a3, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a3, 5(a0) -; RV64I-NEXT: sd a3, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu t3, 6(a0) -; RV64I-NEXT: lbu t4, 7(a0) -; RV64I-NEXT: lbu t5, 8(a0) -; RV64I-NEXT: lbu t6, 9(a0) -; RV64I-NEXT: lbu s0, 10(a0) -; RV64I-NEXT: lbu s1, 11(a0) -; RV64I-NEXT: lbu s2, 12(a0) -; RV64I-NEXT: lbu s3, 13(a0) -; RV64I-NEXT: lbu s4, 14(a0) -; RV64I-NEXT: lbu s5, 15(a0) -; RV64I-NEXT: lbu s6, 16(a0) -; RV64I-NEXT: lbu s7, 17(a0) -; RV64I-NEXT: lbu s8, 18(a0) -; RV64I-NEXT: lbu s9, 19(a0) -; RV64I-NEXT: lbu a3, 1(a1) -; RV64I-NEXT: lbu s10, 0(a1) -; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu ra, 3(a1) -; RV64I-NEXT: slli a3, a3, 8 -; RV64I-NEXT: or a3, a3, s10 -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: or s11, ra, s11 -; RV64I-NEXT: or a3, s11, a3 -; RV64I-NEXT: lbu s11, 4(a1) -; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: lbu ra, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or s10, s10, s11 -; RV64I-NEXT: lbu s11, 20(a0) -; RV64I-NEXT: slli ra, ra, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, ra -; RV64I-NEXT: lbu ra, 21(a0) -; RV64I-NEXT: or a1, a1, s10 -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or t2, a1, a3 -; RV64I-NEXT: lbu t0, 23(a0) -; RV64I-NEXT: lbu a7, 24(a0) -; RV64I-NEXT: lbu a6, 25(a0) -; RV64I-NEXT: lbu a5, 26(a0) -; RV64I-NEXT: lbu a1, 30(a0) -; RV64I-NEXT: lbu a3, 29(a0) -; RV64I-NEXT: lbu a4, 28(a0) -; RV64I-NEXT: lbu a0, 27(a0) -; RV64I-NEXT: sb a1, 86(sp) -; RV64I-NEXT: sb 
a3, 85(sp) -; RV64I-NEXT: sb a4, 84(sp) -; RV64I-NEXT: sb a0, 83(sp) -; RV64I-NEXT: sb a5, 82(sp) -; RV64I-NEXT: sb a6, 81(sp) -; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb t0, 79(sp) -; RV64I-NEXT: sb s10, 78(sp) -; RV64I-NEXT: sb ra, 77(sp) -; RV64I-NEXT: sb s11, 76(sp) -; RV64I-NEXT: sb s9, 75(sp) -; RV64I-NEXT: sb s8, 74(sp) -; RV64I-NEXT: sb s7, 73(sp) -; RV64I-NEXT: sb s6, 72(sp) -; RV64I-NEXT: sb s5, 71(sp) -; RV64I-NEXT: sb s4, 70(sp) -; RV64I-NEXT: sb s3, 69(sp) -; RV64I-NEXT: sb s2, 68(sp) -; RV64I-NEXT: sb s1, 67(sp) -; RV64I-NEXT: sb s0, 66(sp) -; RV64I-NEXT: sb t6, 65(sp) -; RV64I-NEXT: sb t5, 64(sp) -; RV64I-NEXT: sb t1, 87(sp) -; RV64I-NEXT: slli t1, t1, 56 -; RV64I-NEXT: sb t4, 63(sp) -; RV64I-NEXT: sb t3, 62(sp) -; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 61(sp) -; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 60(sp) -; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 59(sp) -; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 58(sp) -; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 57(sp) -; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 56(sp) -; RV64I-NEXT: srai a0, t1, 63 -; RV64I-NEXT: sb a0, 112(sp) -; RV64I-NEXT: sb a0, 104(sp) -; RV64I-NEXT: sb a0, 96(sp) -; RV64I-NEXT: sb a0, 88(sp) -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: sb a1, 119(sp) -; RV64I-NEXT: srli a3, a0, 48 -; RV64I-NEXT: sb a3, 118(sp) -; RV64I-NEXT: srli a4, a0, 40 -; RV64I-NEXT: sb a4, 117(sp) -; RV64I-NEXT: srli a5, a0, 32 -; RV64I-NEXT: sb a5, 116(sp) -; RV64I-NEXT: srli a6, a0, 24 -; RV64I-NEXT: sb a6, 115(sp) -; RV64I-NEXT: srli a7, a0, 16 -; RV64I-NEXT: sb a7, 114(sp) -; RV64I-NEXT: srli a0, a0, 8 -; RV64I-NEXT: sb a0, 113(sp) -; RV64I-NEXT: sb a1, 111(sp) -; RV64I-NEXT: sb a3, 110(sp) -; RV64I-NEXT: sb a4, 109(sp) -; RV64I-NEXT: sb a5, 108(sp) -; RV64I-NEXT: sb a6, 107(sp) -; RV64I-NEXT: sb a7, 106(sp) -; RV64I-NEXT: sb 
a0, 105(sp) -; RV64I-NEXT: sb a1, 103(sp) -; RV64I-NEXT: sb a3, 102(sp) -; RV64I-NEXT: sb a4, 101(sp) -; RV64I-NEXT: sb a5, 100(sp) -; RV64I-NEXT: sb a6, 99(sp) -; RV64I-NEXT: sb a7, 98(sp) -; RV64I-NEXT: sb a0, 97(sp) -; RV64I-NEXT: sb a1, 95(sp) -; RV64I-NEXT: sb a3, 94(sp) -; RV64I-NEXT: sb a4, 93(sp) -; RV64I-NEXT: sb a5, 92(sp) -; RV64I-NEXT: sb a6, 91(sp) -; RV64I-NEXT: sb a7, 90(sp) -; RV64I-NEXT: sb a0, 89(sp) -; RV64I-NEXT: slli a0, t2, 56 -; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a1, sp, 56 -; RV64I-NEXT: add a1, a1, a0 -; RV64I-NEXT: lbu a0, 9(a1) -; RV64I-NEXT: lbu a3, 8(a1) -; RV64I-NEXT: lbu a4, 10(a1) -; RV64I-NEXT: lbu a5, 11(a1) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: slli a4, a4, 16 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a3, 13(a1) -; RV64I-NEXT: lbu a4, 12(a1) -; RV64I-NEXT: lbu a5, 14(a1) -; RV64I-NEXT: lbu a6, 15(a1) +; RV64I-NEXT: lbu a4, 0(a0) +; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a4, a3, a0 -; RV64I-NEXT: andi a3, t2, 7 -; RV64I-NEXT: lbu a0, 17(a1) -; RV64I-NEXT: lbu a5, 16(a1) -; RV64I-NEXT: lbu a6, 18(a1) -; RV64I-NEXT: lbu a7, 19(a1) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: lbu a4, 5(a0) +; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a1) -; RV64I-NEXT: lbu a6, 20(a1) -; RV64I-NEXT: lbu a7, 22(a1) -; RV64I-NEXT: lbu t0, 23(a1) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; 
RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -2963,467 +2191,378 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a3 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a1) -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 2(a1) -; RV64I-NEXT: lbu t1, 3(a1) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a1) -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: lbu t1, 6(a1) -; RV64I-NEXT: lbu t2, 7(a1) +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 25(a0) +; RV64I-NEXT: lbu a7, 24(a0) +; RV64I-NEXT: lbu t0, 26(a0) +; RV64I-NEXT: lbu t1, 27(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; 
RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: lbu a7, 29(a0) +; RV64I-NEXT: lbu t0, 28(a0) +; RV64I-NEXT: lbu t1, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t1 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 32 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a1) -; RV64I-NEXT: lbu t0, 24(a1) -; RV64I-NEXT: lbu t1, 26(a1) -; RV64I-NEXT: lbu t2, 27(a1) +; RV64I-NEXT: lbu a7, 1(a1) +; RV64I-NEXT: lbu t0, 0(a1) +; RV64I-NEXT: lbu t1, 2(a1) +; RV64I-NEXT: lbu t2, 3(a1) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a1) -; RV64I-NEXT: lbu t1, 28(a1) -; RV64I-NEXT: lbu t2, 30(a1) -; RV64I-NEXT: lbu a1, 31(a1) +; RV64I-NEXT: lbu t0, 5(a1) +; RV64I-NEXT: lbu t1, 4(a1) +; RV64I-NEXT: lbu t2, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a1, a1, 24 ; RV64I-NEXT: or a1, a1, t2 -; RV64I-NEXT: slli t1, a4, 1 ; RV64I-NEXT: or a1, a1, t0 -; RV64I-NEXT: xori t0, a3, 63 -; RV64I-NEXT: sll t1, t1, t0 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a7, a1, a7 -; RV64I-NEXT: slli a1, a7, 1 -; RV64I-NEXT: sll t0, a1, t0 -; RV64I-NEXT: srl a1, a4, a3 -; RV64I-NEXT: srl a4, a6, a3 -; RV64I-NEXT: srl a5, a5, a3 -; RV64I-NEXT: sra a3, a7, a3 -; RV64I-NEXT: srli a6, a5, 48 -; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a5, 40 -; RV64I-NEXT: sb a6, 21(a2) -; RV64I-NEXT: srli a6, a5, 32 -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: sb a6, 19(a2) -; RV64I-NEXT: srli a6, a5, 16 -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: or a6, a5, t0 +; RV64I-NEXT: or a1, 
a1, a7 +; RV64I-NEXT: sraiw a0, a0, 31 +; RV64I-NEXT: sd a0, 56(sp) +; RV64I-NEXT: sd a0, 48(sp) +; RV64I-NEXT: sd a0, 40(sp) +; RV64I-NEXT: sd a0, 32(sp) +; RV64I-NEXT: sd a6, 24(sp) +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a4, 8(sp) +; RV64I-NEXT: sd a3, 0(sp) +; RV64I-NEXT: srli a0, a1, 3 +; RV64I-NEXT: andi a0, a0, 24 +; RV64I-NEXT: mv a3, sp +; RV64I-NEXT: add a3, a3, a0 +; RV64I-NEXT: ld a4, 8(a3) +; RV64I-NEXT: srl a0, a4, a1 +; RV64I-NEXT: ld a5, 16(a3) +; RV64I-NEXT: andi a6, a1, 63 +; RV64I-NEXT: xori a6, a6, 63 +; RV64I-NEXT: ld a7, 0(a3) +; RV64I-NEXT: slli t0, a5, 1 +; RV64I-NEXT: sll t0, t0, a6 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: srl a7, a7, a1 +; RV64I-NEXT: slli a4, a4, 1 +; RV64I-NEXT: ld a3, 24(a3) +; RV64I-NEXT: sll a4, a4, a6 +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: srl a5, a5, a1 +; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: sll a6, a7, a6 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: sra a1, a3, a1 +; RV64I-NEXT: sb a1, 24(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 31(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 30(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 28(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 27(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 26(a2) +; RV64I-NEXT: srli a1, a1, 8 +; RV64I-NEXT: sb a1, 25(a2) ; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: srli a1, a5, 56 +; RV64I-NEXT: sb a1, 23(a2) +; RV64I-NEXT: srli a1, a5, 48 +; RV64I-NEXT: sb a1, 22(a2) +; RV64I-NEXT: srli a1, a5, 40 +; RV64I-NEXT: sb a1, 21(a2) +; RV64I-NEXT: srli a1, a5, 32 +; RV64I-NEXT: sb a1, 20(a2) +; RV64I-NEXT: srli a1, a5, 24 +; RV64I-NEXT: sb a1, 19(a2) +; RV64I-NEXT: srli a1, a5, 16 +; RV64I-NEXT: sb a1, 18(a2) ; RV64I-NEXT: srli a5, a5, 8 ; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a3, 56 -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a3, 48 -; 
RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a3, 40 -; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a3, 32 -; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a3, 24 -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a3, 16 -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a3, 24(a2) -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 25(a2) -; RV64I-NEXT: srli a3, a4, 48 -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: srli a3, a4, 40 -; RV64I-NEXT: sb a3, 5(a2) -; RV64I-NEXT: srli a3, a4, 32 -; RV64I-NEXT: sb a3, 4(a2) -; RV64I-NEXT: srli a3, a4, 24 -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a3, a4, 16 -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: or a3, a4, t1 -; RV64I-NEXT: sb a4, 0(a2) +; RV64I-NEXT: srli a1, a4, 56 +; RV64I-NEXT: sb a1, 7(a2) +; RV64I-NEXT: srli a1, a4, 48 +; RV64I-NEXT: sb a1, 6(a2) +; RV64I-NEXT: srli a1, a4, 40 +; RV64I-NEXT: sb a1, 5(a2) +; RV64I-NEXT: srli a1, a4, 32 +; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a4, 24 +; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a4, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a4, a4, 8 ; RV64I-NEXT: sb a4, 1(a2) -; RV64I-NEXT: srli a4, a1, 48 -; RV64I-NEXT: sb a4, 14(a2) -; RV64I-NEXT: srli a4, a1, 40 -; RV64I-NEXT: sb a4, 13(a2) -; RV64I-NEXT: srli a4, a1, 32 -; RV64I-NEXT: sb a4, 12(a2) -; RV64I-NEXT: srli a4, a1, 24 -; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: srli a4, a1, 16 -; RV64I-NEXT: sb a4, 10(a2) -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: sb a1, 8(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: srli a1, a6, 56 -; RV64I-NEXT: sb a1, 23(a2) -; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: sb a0, 15(a2) -; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 192(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 184(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 
176(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 168(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 160(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 224 +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: addi sp, sp, 64 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -144 -; RV32I-NEXT: sw ra, 140(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 136(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 132(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 128(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 124(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 120(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 116(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 112(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 108(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 104(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s9, 100(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s10, 96(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s11, 92(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t3, 31(a0) -; RV32I-NEXT: lbu a3, 0(a0) -; RV32I-NEXT: sw a3, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: addi sp, sp, -64 ; RV32I-NEXT: lbu a3, 1(a0) -; RV32I-NEXT: sw a3, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 2(a0) -; RV32I-NEXT: sw a3, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 3(a0) -; 
RV32I-NEXT: sw a3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: sw a3, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a3, 5(a0) -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu t2, 6(a0) -; RV32I-NEXT: lbu t4, 7(a0) -; RV32I-NEXT: lbu t5, 8(a0) -; RV32I-NEXT: lbu t6, 9(a0) -; RV32I-NEXT: lbu s0, 10(a0) -; RV32I-NEXT: lbu s1, 11(a0) -; RV32I-NEXT: lbu s2, 12(a0) -; RV32I-NEXT: lbu s3, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu a3, 1(a1) -; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 0(a1) +; RV32I-NEXT: lbu a4, 0(a0) +; RV32I-NEXT: lbu a5, 2(a0) +; RV32I-NEXT: lbu a6, 3(a0) ; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: or a3, a3, s11 -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, ra -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: or t1, a1, a3 -; RV32I-NEXT: lbu t0, 23(a0) -; RV32I-NEXT: lbu a7, 24(a0) -; RV32I-NEXT: lbu a6, 25(a0) -; RV32I-NEXT: lbu a5, 26(a0) -; RV32I-NEXT: lbu a1, 30(a0) -; RV32I-NEXT: lbu a3, 29(a0) -; RV32I-NEXT: lbu a4, 28(a0) -; RV32I-NEXT: lbu a0, 27(a0) -; RV32I-NEXT: sb a1, 58(sp) -; RV32I-NEXT: sb a3, 57(sp) -; RV32I-NEXT: sb a4, 56(sp) -; RV32I-NEXT: sb a0, 55(sp) -; RV32I-NEXT: sb a5, 54(sp) -; RV32I-NEXT: sb a6, 53(sp) -; RV32I-NEXT: sb a7, 52(sp) -; RV32I-NEXT: sb t0, 51(sp) -; RV32I-NEXT: sb ra, 50(sp) -; RV32I-NEXT: sb s11, 49(sp) -; RV32I-NEXT: sb s10, 48(sp) -; RV32I-NEXT: sb s9, 47(sp) -; RV32I-NEXT: sb s8, 46(sp) -; RV32I-NEXT: sb s7, 45(sp) -; RV32I-NEXT: sb s6, 44(sp) -; RV32I-NEXT: sb s5, 43(sp) -; RV32I-NEXT: sb t3, 59(sp) -; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: sb s4, 42(sp) -; RV32I-NEXT: sb s3, 41(sp) -; RV32I-NEXT: sb s2, 40(sp) -; RV32I-NEXT: sb s1, 39(sp) -; RV32I-NEXT: sb 
s0, 38(sp) -; RV32I-NEXT: sb t6, 37(sp) -; RV32I-NEXT: sb t5, 36(sp) -; RV32I-NEXT: sb t4, 35(sp) -; RV32I-NEXT: sb t2, 34(sp) -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 33(sp) -; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 32(sp) -; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 31(sp) -; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 30(sp) -; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 29(sp) -; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 28(sp) -; RV32I-NEXT: srai a0, t3, 31 -; RV32I-NEXT: sb a0, 88(sp) -; RV32I-NEXT: sb a0, 84(sp) -; RV32I-NEXT: sb a0, 80(sp) -; RV32I-NEXT: sb a0, 76(sp) -; RV32I-NEXT: sb a0, 72(sp) -; RV32I-NEXT: sb a0, 68(sp) -; RV32I-NEXT: sb a0, 64(sp) -; RV32I-NEXT: sb a0, 60(sp) -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: sb a1, 91(sp) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 90(sp) -; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 89(sp) -; RV32I-NEXT: sb a1, 87(sp) -; RV32I-NEXT: sb a3, 86(sp) -; RV32I-NEXT: sb a0, 85(sp) -; RV32I-NEXT: sb a1, 83(sp) -; RV32I-NEXT: sb a3, 82(sp) -; RV32I-NEXT: sb a0, 81(sp) -; RV32I-NEXT: sb a1, 79(sp) -; RV32I-NEXT: sb a3, 78(sp) -; RV32I-NEXT: sb a0, 77(sp) -; RV32I-NEXT: sb a1, 75(sp) -; RV32I-NEXT: sb a3, 74(sp) -; RV32I-NEXT: sb a0, 73(sp) -; RV32I-NEXT: sb a1, 71(sp) -; RV32I-NEXT: sb a3, 70(sp) -; RV32I-NEXT: sb a0, 69(sp) -; RV32I-NEXT: sb a1, 67(sp) -; RV32I-NEXT: sb a3, 66(sp) -; RV32I-NEXT: sb a0, 65(sp) -; RV32I-NEXT: sb a1, 63(sp) -; RV32I-NEXT: sb a3, 62(sp) -; RV32I-NEXT: sb a0, 61(sp) -; RV32I-NEXT: slli a0, t1, 24 -; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a4, sp, 28 -; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 -; RV32I-NEXT: slli a3, a3, 16 -; 
RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or t5, a3, a0 -; RV32I-NEXT: andi a3, t1, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: lbu a6, 11(a4) -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a6, a5 -; RV32I-NEXT: or a6, a1, a0 -; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t1, a3 -; RV32I-NEXT: sll a0, a0, t1 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t0, 3(a4) -; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: lbu a4, 5(a0) +; RV32I-NEXT: lbu a5, 4(a0) +; RV32I-NEXT: lbu a6, 6(a0) +; RV32I-NEXT: lbu a7, 7(a0) +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: lbu a5, 9(a0) +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 10(a0) +; RV32I-NEXT: lbu t0, 11(a0) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or t0, a5, a1 -; RV32I-NEXT: slli a1, t5, 1 -; RV32I-NEXT: xori t2, a3, 31 -; RV32I-NEXT: sll a1, a1, t2 -; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) -; RV32I-NEXT: lbu t3, 14(a4) -; RV32I-NEXT: lbu t4, 15(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: lbu a6, 13(a0) +; RV32I-NEXT: lbu a7, 12(a0) +; RV32I-NEXT: lbu t0, 14(a0) +; RV32I-NEXT: lbu t1, 15(a0) +; RV32I-NEXT: slli a6, a6, 8 +; RV32I-NEXT: or a6, a6, a7 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t1, t1, 24 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: lbu a7, 
17(a0) +; RV32I-NEXT: lbu t0, 16(a0) +; RV32I-NEXT: lbu t1, 18(a0) +; RV32I-NEXT: lbu t2, 19(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t0 +; RV32I-NEXT: slli t1, t1, 16 +; RV32I-NEXT: slli t2, t2, 24 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t0, t0, a7 +; RV32I-NEXT: lbu a7, 21(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 22(a0) +; RV32I-NEXT: lbu t3, 23(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t1 +; RV32I-NEXT: slli t2, t2, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t1, t1, a7 +; RV32I-NEXT: lbu a7, 25(a0) +; RV32I-NEXT: lbu t2, 24(a0) +; RV32I-NEXT: lbu t3, 26(a0) +; RV32I-NEXT: lbu t4, 27(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t2 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t3, a7, a5 -; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) -; RV32I-NEXT: lbu t4, 18(a4) -; RV32I-NEXT: lbu t6, 19(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: or t2, t2, a7 +; RV32I-NEXT: lbu a7, 29(a0) +; RV32I-NEXT: lbu t3, 28(a0) +; RV32I-NEXT: lbu t4, 30(a0) +; RV32I-NEXT: lbu a0, 31(a0) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t3 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t4 -; RV32I-NEXT: or t4, a7, a5 -; RV32I-NEXT: slli a5, t4, 1 -; RV32I-NEXT: sll a7, a5, t1 -; RV32I-NEXT: lbu a5, 21(a4) -; RV32I-NEXT: lbu t6, 20(a4) -; RV32I-NEXT: lbu s0, 22(a4) -; RV32I-NEXT: lbu s1, 23(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s0, s0, 16 -; RV32I-NEXT: slli s1, s1, 24 -; RV32I-NEXT: or s0, s1, s0 -; RV32I-NEXT: or s0, s0, a5 -; RV32I-NEXT: lbu a5, 25(a4) -; RV32I-NEXT: lbu t6, 24(a4) -; RV32I-NEXT: lbu s1, 26(a4) -; RV32I-NEXT: lbu s2, 27(a4) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, t6 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli s2, 
s2, 24 -; RV32I-NEXT: or t6, s2, s1 -; RV32I-NEXT: or t6, t6, a5 -; RV32I-NEXT: lbu a5, 29(a4) -; RV32I-NEXT: lbu s1, 28(a4) -; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t1, s2, t1 -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: lbu a4, 31(a4) -; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a4, a4, s1 -; RV32I-NEXT: slli s1, s0, 1 -; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: or s3, a4, a5 -; RV32I-NEXT: slli a4, s3, 1 -; RV32I-NEXT: sll t2, a4, t2 -; RV32I-NEXT: srl a4, t5, a3 -; RV32I-NEXT: srl a5, t0, a3 -; RV32I-NEXT: srl t0, t3, a3 -; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t3, s0, a3 -; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t5, t6, a3 -; RV32I-NEXT: sra a3, s3, a3 -; RV32I-NEXT: srli t6, t5, 16 -; RV32I-NEXT: sb t6, 26(a2) -; RV32I-NEXT: or t2, t5, t2 -; RV32I-NEXT: sb t5, 24(a2) -; RV32I-NEXT: srli t5, t5, 8 -; RV32I-NEXT: sb t5, 25(a2) -; RV32I-NEXT: srli t5, a3, 24 -; RV32I-NEXT: sb t5, 31(a2) -; RV32I-NEXT: srli t5, a3, 16 -; RV32I-NEXT: sb t5, 30(a2) -; RV32I-NEXT: sb a3, 28(a2) -; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, t4, 16 -; RV32I-NEXT: sb a3, 18(a2) -; RV32I-NEXT: or a3, t4, s1 -; RV32I-NEXT: sb t4, 16(a2) -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 17(a2) -; RV32I-NEXT: srli t4, t3, 16 -; RV32I-NEXT: sb t4, 22(a2) -; RV32I-NEXT: or t1, t3, t1 -; RV32I-NEXT: sb t3, 20(a2) -; RV32I-NEXT: srli t3, t3, 8 -; RV32I-NEXT: sb t3, 21(a2) -; RV32I-NEXT: srli t3, a6, 16 -; RV32I-NEXT: sb t3, 10(a2) -; RV32I-NEXT: or t3, a6, s2 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t0, 16 -; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: sb t0, 12(a2) -; RV32I-NEXT: srli a7, t0, 8 -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a5, 16 -; RV32I-NEXT: sb a7, 2(a2) -; 
RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a5, a4, 16 -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: sb a4, 4(a2) +; RV32I-NEXT: slli a0, a0, 24 +; RV32I-NEXT: or t3, a0, t4 +; RV32I-NEXT: or t3, t3, a7 +; RV32I-NEXT: lbu a7, 1(a1) +; RV32I-NEXT: lbu t4, 0(a1) +; RV32I-NEXT: lbu t5, 2(a1) +; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: or a7, a7, t4 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, t5 +; RV32I-NEXT: or a7, a1, a7 +; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: sw a0, 60(sp) +; RV32I-NEXT: sw a0, 56(sp) +; RV32I-NEXT: sw a0, 52(sp) +; RV32I-NEXT: sw a0, 48(sp) +; RV32I-NEXT: sw a0, 44(sp) +; RV32I-NEXT: sw a0, 40(sp) +; RV32I-NEXT: sw a0, 36(sp) +; RV32I-NEXT: sw a0, 32(sp) +; RV32I-NEXT: sw t3, 28(sp) +; RV32I-NEXT: sw t2, 24(sp) +; RV32I-NEXT: sw t1, 20(sp) +; RV32I-NEXT: sw t0, 16(sp) +; RV32I-NEXT: sw a6, 12(sp) +; RV32I-NEXT: sw a5, 8(sp) +; RV32I-NEXT: sw a4, 4(sp) +; RV32I-NEXT: sw a3, 0(sp) +; RV32I-NEXT: srli a0, a7, 3 +; RV32I-NEXT: andi a0, a0, 28 +; RV32I-NEXT: mv a1, sp +; RV32I-NEXT: add a4, a1, a0 +; RV32I-NEXT: lw a1, 4(a4) +; RV32I-NEXT: srl a0, a1, a7 +; RV32I-NEXT: lw a5, 8(a4) +; RV32I-NEXT: andi a3, a7, 31 +; RV32I-NEXT: xori a6, a3, 31 +; RV32I-NEXT: lw a3, 0(a4) +; RV32I-NEXT: slli t0, a5, 1 +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a0, a0, t0 +; RV32I-NEXT: srl a3, a3, a7 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: lw t0, 12(a4) +; RV32I-NEXT: lw t1, 16(a4) +; RV32I-NEXT: sll a1, a1, a6 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: srl a3, t0, a7 +; RV32I-NEXT: slli t2, t1, 1 +; RV32I-NEXT: sll t2, t2, a6 +; RV32I-NEXT: or a3, a3, t2 +; RV32I-NEXT: srl a5, a5, a7 +; RV32I-NEXT: slli t0, t0, 1 +; RV32I-NEXT: lw t2, 20(a4) +; RV32I-NEXT: lw t3, 24(a4) +; RV32I-NEXT: sll t0, t0, a6 +; RV32I-NEXT: or a5, a5, t0 +; RV32I-NEXT: srl t0, t2, a7 +; 
RV32I-NEXT: slli t4, t3, 1 +; RV32I-NEXT: sll t4, t4, a6 +; RV32I-NEXT: or t0, t0, t4 +; RV32I-NEXT: srl t1, t1, a7 +; RV32I-NEXT: slli t2, t2, 1 +; RV32I-NEXT: lw a4, 28(a4) +; RV32I-NEXT: sll t2, t2, a6 +; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: srl t2, t3, a7 +; RV32I-NEXT: slli t3, a4, 1 +; RV32I-NEXT: sll a6, t3, a6 +; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: sra a4, a4, a7 +; RV32I-NEXT: sb a4, 28(a2) +; RV32I-NEXT: srli a7, a4, 24 +; RV32I-NEXT: sb a7, 31(a2) +; RV32I-NEXT: srli a7, a4, 16 +; RV32I-NEXT: sb a7, 30(a2) ; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 5(a2) -; RV32I-NEXT: srli a4, t2, 24 +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: sb a6, 24(a2) +; RV32I-NEXT: sb t1, 16(a2) +; RV32I-NEXT: sb t0, 20(a2) +; RV32I-NEXT: sb a5, 8(a2) +; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: srli a4, a6, 24 ; RV32I-NEXT: sb a4, 27(a2) -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: sb a3, 19(a2) -; RV32I-NEXT: srli a3, t1, 24 -; RV32I-NEXT: sb a3, 23(a2) -; RV32I-NEXT: srli a3, t3, 24 -; RV32I-NEXT: sb a3, 11(a2) -; RV32I-NEXT: srli a3, a6, 24 -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 3(a2) -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: sb a0, 7(a2) -; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 128(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 124(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 120(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 116(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 112(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 108(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 104(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s9, 100(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s10, 96(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s11, 92(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 144 +; 
RV32I-NEXT: srli a4, a6, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: srli a4, a6, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t1, 24 +; RV32I-NEXT: sb a4, 19(a2) +; RV32I-NEXT: srli a4, t1, 16 +; RV32I-NEXT: sb a4, 18(a2) +; RV32I-NEXT: srli a4, t1, 8 +; RV32I-NEXT: sb a4, 17(a2) +; RV32I-NEXT: srli a4, t0, 24 +; RV32I-NEXT: sb a4, 23(a2) +; RV32I-NEXT: srli a4, t0, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: srli a4, t0, 8 +; RV32I-NEXT: sb a4, 21(a2) +; RV32I-NEXT: srli a4, a5, 24 +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a4, a5, 16 +; RV32I-NEXT: sb a4, 10(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 13(a2) +; RV32I-NEXT: srli a3, a1, 24 +; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: sb a1, 1(a2) +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) +; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: addi sp, sp, 64 ; RV32I-NEXT: ret %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll new file mode 100644 index 000000000000..52048a0a2065 --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll @@ -0,0 +1,415 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX2 +; RUN: llc -mattr=+avx512f 
-mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512F +; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512BW + +define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize__half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $16, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize__half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $16, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $16, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize__half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $16, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $16, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v_test_canonicalize__half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzwl (%rdi), %eax +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize__half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzwl (%rdi), %eax +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load half, half addrspace(1)* %out + %canonicalized = call half @llvm.canonicalize.f16(half %val) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind { +; SSE-LABEL: complex_canonicalize_fmul_half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; 
SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: 
callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: popq %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: complex_canonicalize_fmul_half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rax +; AVX2-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovss (%rsp), %xmm1 # 4-byte Reload +; AVX2-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vsubss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: popq %rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: complex_canonicalize_fmul_half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpextrw $0, %xmm1, %eax +; AVX512F-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: complex_canonicalize_fmul_half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax +; AVX512BW-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vaddss %xmm1, %xmm0, %xmm2 +; 
AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm2 +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: retq +entry: + + %mul1 = fsub half %a, %b + %add = fadd half %mul1, %b + %mul2 = fsub half %add, %mul1 + %canonicalized = call half @llvm.canonicalize.f16(half %mul2) + %result = fsub half %canonicalized, %b + ret half %result +} + +define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind { +; SSE-LABEL: v_test_canonicalize_v2half: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbx +; SSE-NEXT: subq $48, %rsp +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: pinsrw $0, 2(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: movaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: callq __extendhfsf2@PLT +; SSE-NEXT: mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; SSE-NEXT: callq __truncsfhf2@PLT +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, 2(%rbx) +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rbx) +; SSE-NEXT: addq $48, %rsp +; SSE-NEXT: popq %rbx +; SSE-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_v2half: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $48, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: callq __extendhfsf2@PLT +; AVX1-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX1-NEXT: callq __truncsfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: addq $48, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: v_test_canonicalize_v2half: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $48, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vpinsrw $0, 2(%rdi), %xmm0, %xmm0 
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: addq $48, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v_test_canonicalize_v2half: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512F-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vmovd %xmm2, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; 
AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512F-NEXT: vmovd %xmm0, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v_test_canonicalize_v2half: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BW-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BW-NEXT: vmulss %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovd %xmm2, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BW-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX512BW-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX512BW-NEXT: vmovd %xmm0, (%rdi) +; AVX512BW-NEXT: retq +entry: + %val = load <2 x half>, <2 x half> addrspace(1)* %out + %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out + ret void +} + diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll new file mode 100644 index 000000000000..13ea53389411 --- /dev/null +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -0,0 +1,636 @@ 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 +; RUN: llc -mtriple=i686-- --mattr=-sse2 < %s | FileCheck %s -check-prefixes=SSE1 +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE2 +; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1 +; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX2 +; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX1,AVX512F + +define float @canon_fp32_varargsf32(float %a) { +; SSE1-LABEL: canon_fp32_varargsf32: +; SSE1: # %bb.0: +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq + + %canonicalized = call float @llvm.canonicalize.f32(float %a) + ret float %canonicalized +} + +define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) { +; SSE1-LABEL: canon_fp32_varargsf80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp32_varargsf80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp32_varargsf80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: retq + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %a) + ret x86_fp80 %canonicalized +} + +define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) { +; SSE1-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE1: # %bb.0: # %entry +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fldt 
{{[0-9]+}}(%esp) +; SSE1-NEXT: fsub %st(1), %st +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fadd %st(2), %st +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fsubp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: complex_canonicalize_fmul_x86_fp80: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fsub %st(1), %st +; SSE2-NEXT: fld %st(0) +; SSE2-NEXT: fadd %st(2), %st +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fsubp %st, %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: complex_canonicalize_fmul_x86_fp80: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fsub %st(1), %st +; AVX1-NEXT: fld %st(0) +; AVX1-NEXT: fadd %st(2), %st +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fsubp %st, %st(1) +; AVX1-NEXT: retq +entry: + + %mul1 = fsub x86_fp80 %a, %b + %add = fadd x86_fp80 %mul1, %b + %mul2 = fsub x86_fp80 %add, %mul1 + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %mul2) + %result = fsub x86_fp80 %canonicalized, %b + ret x86_fp80 %result +} + +define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp64: +; SSE1: # %bb.0: # %start +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB3_2 +; SSE1-NEXT: # %bb.1: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB3_2: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB3_4 +; SSE1-NEXT: # %bb.3: # %start +; 
SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB3_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp64: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp64: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp64: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %c = fcmp olt double %a, %b + %d = fcmp uno double %a, 0.000000e+00 + %or.cond.i.i = or i1 %d, %c + %e = select i1 %or.cond.i.i, double %b, double %a + %f = tail call double @llvm.canonicalize.f64(double %e) #2 + ret double %f +} + +define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 { +; SSE1-LABEL: canonicalize_fp32: +; SSE1: # %bb.0: # %start +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: flds {{[0-9]+}}(%esp) +; SSE1-NEXT: fucom %st(1) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fucom %st(0) +; SSE1-NEXT: fnstsw %ax +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: ja .LBB4_2 +; SSE1-NEXT: # %bb.1: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fldz +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: .LBB4_2: 
# %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: # kill: def $ah killed $ah killed $ax +; SSE1-NEXT: sahf +; SSE1-NEXT: jp .LBB4_4 +; SSE1-NEXT: # %bb.3: # %start +; SSE1-NEXT: fstp %st(1) +; SSE1-NEXT: fldz +; SSE1-NEXT: .LBB4_4: # %start +; SSE1-NEXT: fstp %st(0) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_fp32: +; SSE2: # %bb.0: # %start +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canonicalize_fp32: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canonicalize_fp32: +; AVX512F: # %bb.0: # %start +; AVX512F-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 +; AVX512F-NEXT: retq +start: + + %cc = fcmp olt float %aa, %bb + %dd = fcmp uno float %aa, 0.000000e+00 + %or.cond.i.i.x = or i1 %dd, %cc + %ee = select i1 %or.cond.i.i.x, float %bb, float %aa + %ff = tail call float @llvm.canonicalize.f32(float %ee) #2 + ret float %ff +} + +define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: 
mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movss %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovss %xmm0, (%rdi) +; AVX1-NEXT: retq + %val = load float, float addrspace(1)* %out + %canonicalized = call float @llvm.canonicalize.f32(float %val) + store float %canonicalized, float addrspace(1)* %out + ret void +} + +define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmulp %st, %st(1) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmulp %st, %st(1) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_x86_fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmulp %st, %st(1) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + + %val = load x86_fp80, x86_fp80 addrspace(1)* %out + %canonicalized = call x86_fp80 @llvm.canonicalize.f80(x86_fp80 %val) + store x86_fp80 %canonicalized, x86_fp80 addrspace(1)* %out + ret void +} + +define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +; SSE1-LABEL: v_test_canonicalize_var_f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: v_test_canonicalize_var_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movsd %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: v_test_canonicalize_var_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovsd {{.*#+}} 
xmm0 = mem[0],zero +; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovsd %xmm0, (%rdi) +; AVX1-NEXT: retq + + %val = load double, double addrspace(1)* %out + %canonicalized = call double @llvm.canonicalize.f64(double %val) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define void @canonicalize_undef(double addrspace(1)* %out) { +; SSE1-LABEL: canonicalize_undef: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; SSE1-NEXT: movl $0, (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canonicalize_undef: +; SSE2: # %bb.0: +; SSE2-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; SSE2-NEXT: movq %rax, (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canonicalize_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 +; AVX1-NEXT: movq %rax, (%rdi) +; AVX1-NEXT: retq + + %canonicalized = call double @llvm.canonicalize.f64(double undef) + store double %canonicalized, double addrspace(1)* %out + ret void +} + +define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { +; SSE1-LABEL: canon_fp32_varargsv4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls {{[0-9]+}}(%esp) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp32_varargsv4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp32_varargsv4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; 
AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp32_varargsv4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) + ret <4 x float> %canonicalized +} + +define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { +; SSE1-LABEL: canon_fp64_varargsv4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull {{[0-9]+}}(%esp) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl $4 +; +; SSE2-LABEL: canon_fp64_varargsv4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] +; SSE2-NEXT: mulpd %xmm2, %xmm0 +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; AVX2-LABEL: canon_fp64_varargsv4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: canon_fp64_varargsv4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq + %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a) + ret <4 x double> %canonicalized +} + +define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) { +; SSE1-LABEL: canon_fp80_varargsv2fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fldt {{[0-9]+}}(%esp) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmulp %st, 
%st(2) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: retl +; +; SSE2-LABEL: canon_fp80_varargsv2fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fldt {{[0-9]+}}(%rsp) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmulp %st, %st(2) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: retq +; +; AVX1-LABEL: canon_fp80_varargsv2fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fldt {{[0-9]+}}(%rsp) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmulp %st, %st(2) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: retq + %canonicalized = call <2 x x86_fp80> @llvm.canonicalize.v2f80(<2 x x86_fp80> %a) + ret <2 x x86_fp80> %canonicalized +} + +define void @vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f32: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmuls (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmuls 4(%eax) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmuls 8(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmuls 12(%eax) +; SSE1-NEXT: fstps 12(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstps 8(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstps 4(%eax) +; SSE1-NEXT: fstps (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rdi) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulps (%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovaps %xmm0, (%rdi) +; AVX512F-NEXT: retq + %val = 
load <4 x float>, <4 x float> addrspace(1)* %out + %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %val) + store <4 x float> %canonicalized, <4 x float> addrspace(1)* %out + ret void +} + +define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 { +; SSE1-LABEL: vec_canonicalize_var_v4f64: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fld1 +; SSE1-NEXT: fld %st(0) +; SSE1-NEXT: fmull (%eax) +; SSE1-NEXT: fld %st(1) +; SSE1-NEXT: fmull 8(%eax) +; SSE1-NEXT: fld %st(2) +; SSE1-NEXT: fmull 16(%eax) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fmull 24(%eax) +; SSE1-NEXT: fstpl 24(%eax) +; SSE1-NEXT: fxch %st(2) +; SSE1-NEXT: fstpl 16(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpl 8(%eax) +; SSE1-NEXT: fstpl (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_var_v4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] +; SSE2-NEXT: movapd 16(%rdi), %xmm1 +; SSE2-NEXT: mulpd %xmm0, %xmm1 +; SSE2-NEXT: mulpd (%rdi), %xmm0 +; SSE2-NEXT: movapd %xmm0, (%rdi) +; SSE2-NEXT: movapd %xmm1, 16(%rdi) +; SSE2-NEXT: retq +; +; AVX2-LABEL: vec_canonicalize_var_v4f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vec_canonicalize_var_v4f64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512F-NEXT: vmulpd (%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovapd %ymm0, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq + %val = load <4 x double>, <4 x double> addrspace(1)* %out + %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %val) + store <4 x double> %canonicalized, <4 x double> addrspace(1)* %out + ret void +} + +define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { +; SSE1-LABEL: 
vec_canonicalize_x86_fp80: +; SSE1: # %bb.0: +; SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE1-NEXT: fldt 30(%eax) +; SSE1-NEXT: fldt 20(%eax) +; SSE1-NEXT: fldt 10(%eax) +; SSE1-NEXT: fldt (%eax) +; SSE1-NEXT: fld1 +; SSE1-NEXT: fmul %st, %st(1) +; SSE1-NEXT: fmul %st, %st(2) +; SSE1-NEXT: fmul %st, %st(3) +; SSE1-NEXT: fmulp %st, %st(4) +; SSE1-NEXT: fxch %st(3) +; SSE1-NEXT: fstpt 30(%eax) +; SSE1-NEXT: fxch %st(1) +; SSE1-NEXT: fstpt 20(%eax) +; SSE1-NEXT: fstpt 10(%eax) +; SSE1-NEXT: fstpt (%eax) +; SSE1-NEXT: retl +; +; SSE2-LABEL: vec_canonicalize_x86_fp80: +; SSE2: # %bb.0: +; SSE2-NEXT: fldt 30(%rdi) +; SSE2-NEXT: fldt 20(%rdi) +; SSE2-NEXT: fldt 10(%rdi) +; SSE2-NEXT: fldt (%rdi) +; SSE2-NEXT: fld1 +; SSE2-NEXT: fmul %st, %st(1) +; SSE2-NEXT: fmul %st, %st(2) +; SSE2-NEXT: fmul %st, %st(3) +; SSE2-NEXT: fmulp %st, %st(4) +; SSE2-NEXT: fxch %st(3) +; SSE2-NEXT: fstpt 30(%rdi) +; SSE2-NEXT: fxch %st(1) +; SSE2-NEXT: fstpt 20(%rdi) +; SSE2-NEXT: fstpt 10(%rdi) +; SSE2-NEXT: fstpt (%rdi) +; SSE2-NEXT: retq +; +; AVX1-LABEL: vec_canonicalize_x86_fp80: +; AVX1: # %bb.0: +; AVX1-NEXT: fldt 30(%rdi) +; AVX1-NEXT: fldt 20(%rdi) +; AVX1-NEXT: fldt 10(%rdi) +; AVX1-NEXT: fldt (%rdi) +; AVX1-NEXT: fld1 +; AVX1-NEXT: fmul %st, %st(1) +; AVX1-NEXT: fmul %st, %st(2) +; AVX1-NEXT: fmul %st, %st(3) +; AVX1-NEXT: fmulp %st, %st(4) +; AVX1-NEXT: fxch %st(3) +; AVX1-NEXT: fstpt 30(%rdi) +; AVX1-NEXT: fxch %st(1) +; AVX1-NEXT: fstpt 20(%rdi) +; AVX1-NEXT: fstpt 10(%rdi) +; AVX1-NEXT: fstpt (%rdi) +; AVX1-NEXT: retq + %val = load <4 x x86_fp80>, <4 x x86_fp80> addrspace(1)* %out + %canonicalized = call <4 x x86_fp80> @llvm.canonicalize.v4f80(<4 x x86_fp80> %val) + store <4 x x86_fp80> %canonicalized, <4 x x86_fp80> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 1d3b015f3c54..c350ed64280d 100644 --- 
a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -174,22 +174,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-LABEL: scalar_i128: ; X86: # %bb.0: # %_udiv-special-cases ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $156, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $176, %esp +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl 16(%ebp), %edx ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -198,32 +199,33 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl 32(%ebp), %ebx ; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; 
X86-NEXT: movl 28(%ebp), %edi ; X86-NEXT: xorl %edx, %edi ; X86-NEXT: subl %edx, %edi -; X86-NEXT: sbbl %edx, %ebp ; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %edi, %ecx -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %edx @@ -232,359 +234,357 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %edi, %edi ; X86-NEXT: xorl $31, %edi ; X86-NEXT: orl $32, %edi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %edi ; X86-NEXT: orl $64, %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %eax, 
%edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: bsrl %ebx, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: orl $32, %edx -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: orl $64, %edx -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %eax, %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: subl %edx, %edi ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: movl $127, %ecx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: cmpl %edi, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: setb %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: cmovnel %ebx, %edx -; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovnel %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovnel %ebx, %eax ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: jne .LBB4_8 -; X86-NEXT: # %bb.1: # %_udiv-special-cases -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl $127, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.8: # %_udiv-special-cases +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl $127, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: je .LBB4_8 -; X86-NEXT: # %bb.2: # %udiv-bb1 +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: je .LBB4_9 +; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: xorb $127, %al -; X86-NEXT: movb %al, %ch -; X86-NEXT: andb $7, %ch +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al +; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 148(%esp,%edi), %edx -; X86-NEXT: movl 152(%esp,%edi), %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %edx -; X86-NEXT: notb %cl -; X86-NEXT: movl 144(%esp,%edi), %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: shrl %ebp -; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: movl 140(%esp,%edi), %edx -; X86-NEXT: movb %ch, %cl +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 152(%esp,%eax), %esi +; X86-NEXT: movl 156(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 144(%esp,%eax), %edx +; X86-NEXT: movl 148(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl $1, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx -; X86-NEXT: jae .LBB4_3 +; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %edi, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_3: # %udiv-preheader -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB4_7 +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: jmp .LBB4_9 +; X86-NEXT: .LBB4_2: # %udiv-preheader +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 108(%esp,%eax), %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %dl, %ch -; X86-NEXT: andb $7, %ch -; X86-NEXT: movb %dl, %cl -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %edx -; X86-NEXT: movl 104(%esp,%edx), %ebx -; X86-NEXT: movl 100(%esp,%edx), %edi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %ebp -; X86-NEXT: movl 92(%esp,%edx), %esi +; X86-NEXT: movl 104(%esp,%eax), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%edx), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: notb %cl -; X86-NEXT: addl %edi, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl 96(%esp,%eax), %esi +; X86-NEXT: movl 100(%esp,%eax), %eax ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: 
adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shrdl %cl, %ebx, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB4_4: # %udiv-do-while +; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl $1, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebp +; X86-NEXT: shldl $1, %ebx, %edx +; X86-NEXT: shldl $1, %ecx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: shldl $1, %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; 
X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: andl $1, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: subl %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $-1, %edi -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl $-1, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl $-1, %esi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %edi +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edi, %ecx -; X86-NEXT: jne .LBB4_4 -; X86-NEXT: # %bb.5: -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB4_3 +; X86-NEXT: # %bb.4: ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %ebp, %edx +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: shldl $1, %eax, %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: addl %edi, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: shldl $1, %eax, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: addl %esi, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: .LBB4_8: # %udiv-end +; X86-NEXT: .LBB4_9: # %udiv-end ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: 
xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: subl %ecx, %edx ; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: movl %eax, 4(%ecx) +; X86-NEXT: movl %ebx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %ebp, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebp, %edi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl 
%esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: imull %ebp, %edi +; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 40(%ebp), %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %esi +; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: addl $156, %esp +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 58ea70e58028..16dc1d6b446c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -174,379 +174,370 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-LABEL: scalar_i128: ; X86: # %bb.0: # %_udiv-special-cases ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $136, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl %esi, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: 
movl 28(%ebp), %ebx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: orl 36(%ebp), %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sete %bl -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sete %cl +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: orl 24(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: orl 20(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: movb %al, (%esp) # 1-byte Spill -; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: orb %cl, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: bsrl 36(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %edi, %edi -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl %ebp, %ebp -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: orl $32, %ebp -; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: orl $64, %ebp -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: cmovnel %ecx, %ebp -; X86-NEXT: bsrl %esi, %edx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: bsrl %ebx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax +; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmovnel %edx, %eax +; X86-NEXT: orl $64, %eax +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx -; X86-NEXT: testl %esi, %esi +; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 16(%ebp), %edi ; X86-NEXT: bsrl %edi, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx +; X86-NEXT: bsrl 12(%ebp), %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: orl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: orl $64, %edx -; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: subl %edx, %ebp +; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl $127, %ecx -; X86-NEXT: cmpl %ebp, %ecx +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: setb %cl -; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: xorl $127, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %eax ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, 
%edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl $0, %edi -; X86-NEXT: cmovnel %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovnel %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmovnel %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovnel %edi, %ebx -; X86-NEXT: orb %cl, %al -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, %ah +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl $0, %esi +; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: cmovnel %esi, %ecx +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: cmovnel %edx, %esi +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: orb %ah, %al +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: jne .LBB4_7 ; X86-NEXT: # %bb.1: # %udiv-bb1 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: xorb $127, %al -; X86-NEXT: movb %al, %ch -; X86-NEXT: andb 
$7, %ch +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al +; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 128(%esp,%eax), %edx -; X86-NEXT: movl 132(%esp,%eax), %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 136(%esp,%eax), %edi +; X86-NEXT: movl 140(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %edx -; X86-NEXT: notb %cl -; X86-NEXT: movl 124(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 120(%esp,%eax), %eax -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %eax, %ebp -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl $1, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl 128(%esp,%eax), %ebx +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: addl $1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 20(%ebp), %ebx ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.5: +; X86-NEXT: xorl %edx, %edx ; 
X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: jmp .LBB4_6 ; X86-NEXT: .LBB4_2: # %udiv-preheader -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %edx ; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 16(%ebp), %edx ; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movb %al, %ch -; X86-NEXT: andb $7, %ch ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al +; X86-NEXT: andb $12, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl 92(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %esi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%eax), %ebp -; X86-NEXT: movl 76(%esp,%eax), %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: notb %cl -; X86-NEXT: addl %esi, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl 
88(%esp,%eax), %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edi +; X86-NEXT: movl 84(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shrdl %cl, %edx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %ebp, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %ebp +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ebx +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $1, %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $1, %edx, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %edx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Reload +; X86-NEXT: shldl $1, %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl 40(%ebp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: andl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: andl 36(%ebp), %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl 32(%ebp), %edx +; 
X86-NEXT: andl 28(%ebp), %ecx +; X86-NEXT: subl %ecx, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: adcl $-1, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $-1, %edi +; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %ebx, %ecx -; X86-NEXT: movl (%esp), %edi # 4-byte Reload ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X86-NEXT: .LBB4_6: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: shldl $1, %ebp, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NEXT: shldl $1, %ebx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: .LBB4_6: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx ; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl $1, %esi, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: .LBB4_7: # %udiv-end -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %esi, %edi +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: imull %ecx, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull 28(%ebp), %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: imull %edx, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull 32(%ebp) +; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull 32(%ebp) +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: addl $136, %esp +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll index 6fcebdb5116d..fb169a3777fb 100644 --- a/llvm/test/CodeGen/X86/pr38539.ll +++ 
b/llvm/test/CodeGen/X86/pr38539.ll @@ -22,7 +22,7 @@ define void @f() nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $176, %esp +; X86-NEXT: subl $160, %esp ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -96,18 +96,16 @@ define void @f() nounwind { ; X86-NEXT: addl $1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: andl $3, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb $65, %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movb %cl, %ch -; X86-NEXT: andb $7, %ch -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: negb %cl -; X86-NEXT: movsbl %cl, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %esi ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -117,29 +115,24 @@ define void @f() nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esp,%esi), %edi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shll %cl, %edi -; X86-NEXT: notb %cl -; X86-NEXT: movl 128(%esp,%esi), %ebx -; X86-NEXT: movl 132(%esp,%esi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: movl 112(%esp,%esi), %edi +; X86-NEXT: movl 116(%esp,%esi), %eax +; X86-NEXT: movl 120(%esp,%esi), %esi +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, 
%ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: je .LBB0_13 ; X86-NEXT: # %bb.11: # %udiv-preheader -; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl %esi, %edi ; X86-NEXT: andl $3, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -148,26 +141,20 @@ define void @f() nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movb %al, %ch -; X86-NEXT: andb $7, %ch -; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: movl 80(%esp,%edx), %edi -; X86-NEXT: movl 84(%esp,%edx), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: notb %cl -; X86-NEXT: movl 88(%esp,%edx), %ebx -; X86-NEXT: addl %ebx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 72(%esp,%eax), %ebx +; X86-NEXT: movl 
64(%esp,%eax), %esi +; X86-NEXT: movl 68(%esp,%eax), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrdl %cl, %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax @@ -175,70 +162,69 @@ define void @f() nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl $3, %eax -; X86-NEXT: andl $3, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $3, %edi +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_12: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: shldl $1, %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: andl $2, %eax -; X86-NEXT: shrl %eax -; X86-NEXT: leal (%eax,%edi,2), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: andl $2, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%ebx,2), %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %esi -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl %ebx, %esi +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl $1, %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $3, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: shll $30, %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $30, %eax -; X86-NEXT: sarl $31, %esi -; X86-NEXT: shrdl $1, %esi, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $1, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: shll $30, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sarl $30, %edi +; X86-NEXT: sarl $31, %edx +; X86-NEXT: shrdl $1, %edx, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edx, 
%eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %esi, %edx ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: andl $3, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $3, %esi -; X86-NEXT: andl $3, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $3, %edi +; X86-NEXT: andl $3, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %edx, %eax ; X86-NEXT: jne .LBB0_12 diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll index df3c25a8c42a..6be79edbe51e 100644 --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -13,26 +13,24 @@ define i256 @test1(i256 %a) nounwind { ; ILP-LABEL: test1: ; ILP: # %bb.0: ; ILP-NEXT: movq %rdi, %rax +; ILP-NEXT: xorps %xmm0, %xmm0 +; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: leal (%rsi,%rsi), %ecx -; ILP-NEXT: movq $0, 
-{{[0-9]+}}(%rsp) -; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; ILP-NEXT: addb $3, %cl ; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; ILP-NEXT: addb $3, %cl ; ILP-NEXT: movl %ecx, %edx ; ILP-NEXT: shrb $3, %dl -; ILP-NEXT: andb $7, %cl +; ILP-NEXT: andb $24, %dl ; ILP-NEXT: negb %dl ; ILP-NEXT: movsbq %dl, %rdx -; ILP-NEXT: movq -16(%rsp,%rdx), %rsi -; ILP-NEXT: movq -8(%rsp,%rdx), %rdi +; ILP-NEXT: movq -24(%rsp,%rdx), %rsi +; ILP-NEXT: movq -16(%rsp,%rdx), %rdi ; ILP-NEXT: shldq %cl, %rsi, %rdi -; ILP-NEXT: movq -32(%rsp,%rdx), %r8 -; ILP-NEXT: movq -24(%rsp,%rdx), %rdx +; ILP-NEXT: movq -40(%rsp,%rdx), %r8 +; ILP-NEXT: movq -32(%rsp,%rdx), %rdx ; ILP-NEXT: movq %r8, %r9 ; ILP-NEXT: shlq %cl, %r9 ; ILP-NEXT: movq %rdx, %r10 @@ -52,27 +50,25 @@ define i256 @test1(i256 %a) nounwind { ; HYBRID-LABEL: test1: ; HYBRID: # %bb.0: ; HYBRID-NEXT: movq %rdi, %rax -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; HYBRID-NEXT: xorps %xmm0, %xmm0 +; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; HYBRID-NEXT: addl %esi, %esi -; HYBRID-NEXT: addb $3, %sil -; HYBRID-NEXT: movl %esi, %ecx -; HYBRID-NEXT: andb $7, %cl -; HYBRID-NEXT: shrb $3, %sil -; HYBRID-NEXT: negb %sil -; HYBRID-NEXT: movsbq %sil, %rdx -; HYBRID-NEXT: movq -16(%rsp,%rdx), %rsi -; HYBRID-NEXT: movq -8(%rsp,%rdx), %rdi +; HYBRID-NEXT: leal (%rsi,%rsi), %ecx +; HYBRID-NEXT: addb $3, %cl +; HYBRID-NEXT: movl %ecx, %edx +; HYBRID-NEXT: shrb 
$3, %dl +; HYBRID-NEXT: andb $24, %dl +; HYBRID-NEXT: negb %dl +; HYBRID-NEXT: movsbq %dl, %rdx +; HYBRID-NEXT: movq -24(%rsp,%rdx), %rsi +; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdi ; HYBRID-NEXT: shldq %cl, %rsi, %rdi ; HYBRID-NEXT: movq %rdi, 24(%rax) -; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdi -; HYBRID-NEXT: movq -24(%rsp,%rdx), %rdx +; HYBRID-NEXT: movq -40(%rsp,%rdx), %rdi +; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdx ; HYBRID-NEXT: movq %rdx, %r8 ; HYBRID-NEXT: shldq %cl, %rdi, %r8 ; HYBRID-NEXT: movq %r8, 8(%rax) @@ -81,6 +77,7 @@ define i256 @test1(i256 %a) nounwind { ; HYBRID-NEXT: shlq %cl, %rsi ; HYBRID-NEXT: notb %cl ; HYBRID-NEXT: shrq %rdx +; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx ; HYBRID-NEXT: shrq %cl, %rdx ; HYBRID-NEXT: orq %rsi, %rdx ; HYBRID-NEXT: movq %rdx, 16(%rax) @@ -89,27 +86,25 @@ define i256 @test1(i256 %a) nounwind { ; BURR-LABEL: test1: ; BURR: # %bb.0: ; BURR-NEXT: movq %rdi, %rax -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; BURR-NEXT: xorps %xmm0, %xmm0 +; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; BURR-NEXT: addl %esi, %esi -; BURR-NEXT: addb $3, %sil -; BURR-NEXT: movl %esi, %ecx -; BURR-NEXT: andb $7, %cl -; BURR-NEXT: shrb $3, %sil -; BURR-NEXT: negb %sil -; BURR-NEXT: movsbq %sil, %rdx -; BURR-NEXT: movq -16(%rsp,%rdx), %rsi -; BURR-NEXT: movq -8(%rsp,%rdx), %rdi +; BURR-NEXT: leal (%rsi,%rsi), %ecx +; BURR-NEXT: addb $3, %cl +; BURR-NEXT: movl %ecx, %edx +; BURR-NEXT: shrb $3, %dl +; BURR-NEXT: andb $24, %dl +; BURR-NEXT: negb %dl +; BURR-NEXT: movsbq %dl, %rdx +; BURR-NEXT: movq -24(%rsp,%rdx), %rsi +; BURR-NEXT: movq -16(%rsp,%rdx), %rdi ; 
BURR-NEXT: shldq %cl, %rsi, %rdi ; BURR-NEXT: movq %rdi, 24(%rax) -; BURR-NEXT: movq -32(%rsp,%rdx), %rdi -; BURR-NEXT: movq -24(%rsp,%rdx), %rdx +; BURR-NEXT: movq -40(%rsp,%rdx), %rdi +; BURR-NEXT: movq -32(%rsp,%rdx), %rdx ; BURR-NEXT: movq %rdx, %r8 ; BURR-NEXT: shldq %cl, %rdi, %r8 ; BURR-NEXT: movq %r8, 8(%rax) @@ -118,6 +113,7 @@ define i256 @test1(i256 %a) nounwind { ; BURR-NEXT: shlq %cl, %rsi ; BURR-NEXT: notb %cl ; BURR-NEXT: shrq %rdx +; BURR-NEXT: # kill: def $cl killed $cl killed $ecx ; BURR-NEXT: shrq %cl, %rdx ; BURR-NEXT: orq %rsi, %rdx ; BURR-NEXT: movq %rdx, 16(%rax) @@ -126,33 +122,31 @@ define i256 @test1(i256 %a) nounwind { ; SRC-LABEL: test1: ; SRC: # %bb.0: ; SRC-NEXT: movq %rdi, %rax -; SRC-NEXT: addl %esi, %esi -; SRC-NEXT: addb $3, %sil -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SRC-NEXT: leal (%rsi,%rsi), %edx +; SRC-NEXT: addb $3, %dl +; SRC-NEXT: xorps %xmm0, %xmm0 +; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SRC-NEXT: movl %esi, %edx -; SRC-NEXT: andb $7, %dl -; SRC-NEXT: shrb $3, %sil -; SRC-NEXT: negb %sil -; SRC-NEXT: movsbq %sil, %rsi -; SRC-NEXT: movq -16(%rsp,%rsi), %rdi +; SRC-NEXT: movl %edx, %ecx +; SRC-NEXT: shrb $3, %cl +; SRC-NEXT: andb $24, %cl +; SRC-NEXT: negb %cl +; SRC-NEXT: movsbq %cl, %rsi +; SRC-NEXT: movq -24(%rsp,%rsi), %rdi ; SRC-NEXT: movq %rdi, %r8 ; SRC-NEXT: movl %edx, %ecx ; SRC-NEXT: shlq %cl, %r8 ; SRC-NEXT: notb %cl -; SRC-NEXT: movq -32(%rsp,%rsi), %r9 -; SRC-NEXT: movq -24(%rsp,%rsi), %r10 +; SRC-NEXT: movq -40(%rsp,%rsi), %r9 +; SRC-NEXT: movq -32(%rsp,%rsi), %r10 ; SRC-NEXT: movq %r10, %r11 ; SRC-NEXT: shrq %r11 ; SRC-NEXT: shrq %cl, 
%r11 ; SRC-NEXT: orq %r8, %r11 -; SRC-NEXT: movq -8(%rsp,%rsi), %rsi +; SRC-NEXT: movq -16(%rsp,%rsi), %rsi ; SRC-NEXT: movl %edx, %ecx ; SRC-NEXT: shldq %cl, %rdi, %rsi ; SRC-NEXT: movq %r9, %rdi @@ -171,27 +165,25 @@ define i256 @test1(i256 %a) nounwind { ; LIN-NEXT: addb $3, %dl ; LIN-NEXT: movl %edx, %ecx ; LIN-NEXT: shrb $3, %cl +; LIN-NEXT: andb $24, %cl ; LIN-NEXT: negb %cl ; LIN-NEXT: movsbq %cl, %rsi -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; LIN-NEXT: xorps %xmm0, %xmm0 +; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; LIN-NEXT: movq -32(%rsp,%rsi), %rdi -; LIN-NEXT: andb $7, %dl +; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; LIN-NEXT: movq -40(%rsp,%rsi), %rdi ; LIN-NEXT: movq %rdi, %r8 ; LIN-NEXT: movl %edx, %ecx ; LIN-NEXT: shlq %cl, %r8 ; LIN-NEXT: movq %r8, (%rax) -; LIN-NEXT: movq -24(%rsp,%rsi), %r8 +; LIN-NEXT: movq -32(%rsp,%rsi), %r8 ; LIN-NEXT: movq %r8, %r9 ; LIN-NEXT: shldq %cl, %rdi, %r9 ; LIN-NEXT: movq %r9, 8(%rax) -; LIN-NEXT: movq -16(%rsp,%rsi), %rdi +; LIN-NEXT: movq -24(%rsp,%rsi), %rdi ; LIN-NEXT: movq %rdi, %r9 ; LIN-NEXT: shlq %cl, %r9 ; LIN-NEXT: shrq %r8 @@ -199,7 +191,7 @@ define i256 @test1(i256 %a) nounwind { ; LIN-NEXT: shrq %cl, %r8 ; LIN-NEXT: orq %r9, %r8 ; LIN-NEXT: movq %r8, 16(%rax) -; LIN-NEXT: movq -8(%rsp,%rsi), %rsi +; LIN-NEXT: movq -16(%rsp,%rsi), %rsi ; LIN-NEXT: movl %edx, %ecx ; LIN-NEXT: shldq %cl, %rdi, %rsi ; LIN-NEXT: movq %rsi, 24(%rax) diff --git a/llvm/test/CodeGen/X86/section-stats.ll b/llvm/test/CodeGen/X86/section-stats.ll index 94d0a965ac59..2cab7d18dec0 100644 --- a/llvm/test/CodeGen/X86/section-stats.ll +++ b/llvm/test/CodeGen/X86/section-stats.ll @@ -3,6 +3,8 @@ ; CHECK-DAG: 1 
elf-object-writer - Total size of SHF_ALLOC text sections ; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC read-write sections +; CHECK-DAG: 512 elf-object-writer - Total size of section headers table +; CHECK-DAG: 64 elf-object-writer - Total size of ELF headers target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 4fbe05cd1b2f..767bd772ab7a 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -10,49 +10,45 @@ define void @test_lshr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-LABEL: test_lshr_i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $48, %esp +; i686-NEXT: movl 24(%ebp), %ecx +; i686-NEXT: movl 8(%ebp), %eax +; i686-NEXT: movl 12(%ebp), %edx +; i686-NEXT: movl 16(%ebp), %esi +; i686-NEXT: movl 20(%ebp), %edi ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, (%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, %eax -; i686-NEXT: andb $7, %al -; i686-NEXT: shrb $3, %cl -; i686-NEXT: andb $15, %cl -; i686-NEXT: movzbl %cl, %ebp -; i686-NEXT: movl 4(%esp,%ebp), %edx -; i686-NEXT: movl %edx, %esi -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: notb %cl -; i686-NEXT: movl 8(%esp,%ebp), %ebx -; i686-NEXT: leal (%ebx,%ebx), %edi -; 
i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl (%esp,%ebp), %esi -; i686-NEXT: movl 12(%esp,%ebp), %ebp -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebp, %ebx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: shrl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 12(%eax) -; i686-NEXT: movl %ebx, 8(%eax) -; i686-NEXT: movl %esi, (%eax) -; i686-NEXT: movl %edi, 4(%eax) -; i686-NEXT: addl $32, %esp +; i686-NEXT: shrb $3, %al +; i686-NEXT: andb $12, %al +; i686-NEXT: movzbl %al, %edi +; i686-NEXT: movl 8(%esp,%edi), %eax +; i686-NEXT: movl 4(%esp,%edi), %ebx +; i686-NEXT: movl %ebx, %edx +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: movl (%esp,%edi), %esi +; i686-NEXT: movl 12(%esp,%edi), %edi +; i686-NEXT: shrdl %cl, %edi, %eax +; i686-NEXT: shrdl %cl, %ebx, %esi +; i686-NEXT: movl 40(%ebp), %ebx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrl %cl, %edi +; i686-NEXT: movl %edi, 12(%ebx) +; i686-NEXT: movl %eax, 8(%ebx) +; i686-NEXT: movl %edx, 4(%ebx) +; i686-NEXT: movl %esi, (%ebx) +; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -81,50 +77,46 @@ define void @test_ashr_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-LABEL: test_ashr_i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $48, %esp +; i686-NEXT: movl 24(%ebp), %ecx +; i686-NEXT: movl 8(%ebp), %eax +; i686-NEXT: movl 12(%ebp), %edx +; i686-NEXT: movl 16(%ebp), %esi +; i686-NEXT: movl 20(%ebp), %edi ; i686-NEXT: movl 
%edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, (%esp) -; i686-NEXT: sarl $31, %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: sarl $31, %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, %eax -; i686-NEXT: andb $7, %al -; i686-NEXT: shrb $3, %cl -; i686-NEXT: andb $15, %cl -; i686-NEXT: movzbl %cl, %ebp -; i686-NEXT: movl 4(%esp,%ebp), %edx -; i686-NEXT: movl %edx, %esi -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: notb %cl -; i686-NEXT: movl 8(%esp,%ebp), %ebx -; i686-NEXT: leal (%ebx,%ebx), %edi -; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %esi, %edi -; i686-NEXT: movl (%esp,%ebp), %esi -; i686-NEXT: movl 12(%esp,%ebp), %ebp -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebp, %ebx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: sarl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl %ebp, 12(%eax) -; i686-NEXT: movl %ebx, 8(%eax) -; i686-NEXT: movl %esi, (%eax) -; i686-NEXT: movl %edi, 4(%eax) -; i686-NEXT: addl $32, %esp +; i686-NEXT: shrb $3, %al +; i686-NEXT: andb $12, %al +; i686-NEXT: movzbl %al, %edi +; i686-NEXT: movl 8(%esp,%edi), %eax +; i686-NEXT: movl 4(%esp,%edi), %ebx +; i686-NEXT: movl %ebx, %edx +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: movl (%esp,%edi), %esi +; i686-NEXT: movl 12(%esp,%edi), %edi +; i686-NEXT: shrdl %cl, %edi, %eax +; i686-NEXT: shrdl %cl, %ebx, %esi +; i686-NEXT: movl 40(%ebp), %ebx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: sarl %cl, %edi +; i686-NEXT: movl %edi, 12(%ebx) +; i686-NEXT: movl %eax, 8(%ebx) +; i686-NEXT: movl %edx, 4(%ebx) +; i686-NEXT: movl %esi, (%ebx) +; 
i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -154,15 +146,17 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-LABEL: test_shl_i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $48, %esp +; i686-NEXT: movl 24(%ebp), %ecx +; i686-NEXT: movl 8(%ebp), %eax +; i686-NEXT: movl 12(%ebp), %edx +; i686-NEXT: movl 16(%ebp), %esi +; i686-NEXT: movl 20(%ebp), %edi ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -172,36 +166,27 @@ define void @test_shl_i128(i128 %x, i128 %a, ptr nocapture %r) nounwind { ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, (%esp) ; i686-NEXT: movl %ecx, %eax -; i686-NEXT: andb $7, %al -; i686-NEXT: shrb $3, %cl -; i686-NEXT: andb $15, %cl -; i686-NEXT: negb %cl -; i686-NEXT: movsbl %cl, %ebp -; i686-NEXT: movl 24(%esp,%ebp), %ebx -; i686-NEXT: movl %ebx, %edx -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: notb %cl -; i686-NEXT: movl 20(%esp,%ebp), %edi -; i686-NEXT: movl %edi, %esi -; i686-NEXT: shrl %esi -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %edx, %esi -; i686-NEXT: movl 16(%esp,%ebp), %edx -; i686-NEXT: movl 28(%esp,%ebp), %ebp -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shldl %cl, %ebx, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl %edx, %ebx -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %ebx -; i686-NEXT: shldl %cl, %edx, %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: 
movl %edi, 4(%eax) -; i686-NEXT: movl %ebx, (%eax) -; i686-NEXT: movl %esi, 8(%eax) -; i686-NEXT: addl $32, %esp +; i686-NEXT: shrb $3, %al +; i686-NEXT: andb $12, %al +; i686-NEXT: negb %al +; i686-NEXT: movsbl %al, %edi +; i686-NEXT: movl 20(%esp,%edi), %eax +; i686-NEXT: movl 24(%esp,%edi), %ebx +; i686-NEXT: movl %ebx, %esi +; i686-NEXT: shldl %cl, %eax, %esi +; i686-NEXT: movl 16(%esp,%edi), %edx +; i686-NEXT: movl 28(%esp,%edi), %edi +; i686-NEXT: shldl %cl, %ebx, %edi +; i686-NEXT: movl 40(%ebp), %ebx +; i686-NEXT: movl %edi, 12(%ebx) +; i686-NEXT: movl %esi, 8(%ebx) +; i686-NEXT: movl %edx, %esi +; i686-NEXT: shll %cl, %esi +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shldl %cl, %edx, %eax +; i686-NEXT: movl %eax, 4(%ebx) +; i686-NEXT: movl %esi, (%ebx) +; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -264,104 +249,93 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-LABEL: test_lshr_v2i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $100, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $112, %esp +; i686-NEXT: movl 40(%ebp), %edx +; i686-NEXT: movl 24(%ebp), %eax +; i686-NEXT: movl 28(%ebp), %ecx +; i686-NEXT: movl 32(%ebp), %esi 
+; i686-NEXT: movl 20(%ebp), %edi ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) +; i686-NEXT: movl 16(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 12(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 8(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 36(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl %esi, %ecx -; i686-NEXT: andl $7, %ecx +; i686-NEXT: movl %edx, %ebx +; i686-NEXT: andl $31, %ebx +; i686-NEXT: shrl $3, %edx +; i686-NEXT: andl $12, %edx +; i686-NEXT: movl 40(%esp,%edx), %eax +; i686-NEXT: movl 36(%esp,%edx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: shrdl %cl, %eax, %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 32(%esp,%edx), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl $3, %esi -; i686-NEXT: andl $15, %esi -; i686-NEXT: movl 40(%esp,%esi), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %cl, %eax -; i686-NEXT: notl %ecx -; i686-NEXT: movl 44(%esp,%esi), %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: addl %edx, %edx -; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: orl %eax, %edx +; i686-NEXT: movl 44(%esp,%edx), %edx ; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 36(%esp,%esi), %eax +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl %ebx, %esi +; i686-NEXT: shrdl %cl, %edx, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, %edx -; i686-NEXT: andl $7, %edx -; i686-NEXT: shrl $3, %ebx -; i686-NEXT: andl $15, %ebx -; i686-NEXT: movl 72(%esp,%ebx), %ebp -; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %ebp -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: notl %ecx -; i686-NEXT: movl 76(%esp,%ebx), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: leal (%eax,%eax), %edi -; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: orl %ebp, %edi -; i686-NEXT: movl 48(%esp,%esi), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl 56(%ebp), %edx +; i686-NEXT: movl %edx, %eax +; i686-NEXT: andl $31, %eax +; i686-NEXT: shrl $3, %edx +; i686-NEXT: andl $12, %edx +; i686-NEXT: movl 72(%esp,%edx), %ebx +; i686-NEXT: movl 68(%esp,%edx), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrdl %cl, %ebx, %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 64(%esp,%edx), %edi +; i686-NEXT: movl 76(%esp,%edx), %edx +; i686-NEXT: shrdl %cl, %edx, %ebx +; i686-NEXT: movl %esi, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl 68(%esp,%ebx), %ecx -; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill -; i686-NEXT: movl 80(%esp,%ebx), %esi -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %ebx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Spill ; i686-NEXT: movl %eax, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shrdl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shrl %cl, %ebp -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %esi, 28(%ecx) -; i686-NEXT: movl %ebx, 24(%ecx) -; i686-NEXT: movl (%esp), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 16(%ecx) -; i686-NEXT: movl %ebp, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 8(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %edi, 20(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 4(%ecx) -; i686-NEXT: addl $100, %esp +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: shrl %cl, %edx +; i686-NEXT: movl 72(%ebp), %eax +; i686-NEXT: movl %edx, 28(%eax) +; i686-NEXT: movl %ebx, 24(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 20(%eax) +; i686-NEXT: movl %edi, 16(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 12(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 8(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 4(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -402,107 +376,96 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x 
i128> %a, ptr nocapture %r) no ; i686-LABEL: test_ashr_v2i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $92, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: sarl $31, %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $112, %esp +; i686-NEXT: movl 40(%ebp), %edx +; i686-NEXT: movl 24(%ebp), %eax +; i686-NEXT: movl 28(%ebp), %ecx +; i686-NEXT: movl 32(%ebp), %esi +; i686-NEXT: movl 16(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 12(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 8(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 20(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: sarl $31, %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 36(%ebp), %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; i686-NEXT: sarl $31, %eax -; 
i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, %ebx -; i686-NEXT: andl $7, %ebx -; i686-NEXT: shrl $3, %edi -; i686-NEXT: andl $15, %edi -; i686-NEXT: movl 32(%esp,%edi), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrl %cl, %eax +; i686-NEXT: sarl $31, %edi +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, %eax +; i686-NEXT: andl $31, %eax +; i686-NEXT: shrl $3, %edx +; i686-NEXT: andl $12, %edx +; i686-NEXT: movl 40(%esp,%edx), %esi +; i686-NEXT: movl 36(%esp,%edx), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 32(%esp,%edx), %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 44(%esp,%edx), %edx +; i686-NEXT: movl %edx, (%esp) # 4-byte Spill +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrdl %cl, %edx, %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 56(%ebp), %edx +; i686-NEXT: movl %edx, %ebx +; i686-NEXT: andl $31, %ebx +; i686-NEXT: shrl $3, %edx +; i686-NEXT: andl $12, %edx +; i686-NEXT: movl 72(%esp,%edx), %esi +; i686-NEXT: movl 68(%esp,%edx), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: notl %ecx -; i686-NEXT: movl 36(%esp,%edi), %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: addl %edx, %edx -; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %edx -; i686-NEXT: orl %eax, %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; i686-NEXT: movl %ebp, %eax -; i686-NEXT: movl %ebp, %edx -; i686-NEXT: andl $7, %edx -; i686-NEXT: shrl $3, %eax -; i686-NEXT: andl $15, %eax -; i686-NEXT: movl 64(%esp,%eax), %ebp -; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %eax, (%esp) # 4-byte Spill -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrl %cl, %ebp -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: notl %ecx -; i686-NEXT: movl 68(%esp,%eax), %esi -; i686-NEXT: leal (%esi,%esi), %eax -; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shll %cl, %eax -; i686-NEXT: orl %ebp, %eax -; i686-NEXT: movl 28(%esp,%edi), %ecx +; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 64(%esp,%edx), %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 40(%esp,%edi), %edi +; i686-NEXT: movl 76(%esp,%edx), %edx ; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: shrdl %cl, %edx, %esi +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl (%esp), %ecx # 4-byte Reload -; i686-NEXT: movl 60(%esp,%ecx), %ebp -; i686-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 72(%esp,%ecx), %ebp -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: shrdl %cl, %ebp, %esi -; i686-NEXT: movl %esi, (%esp) # 4-byte Spill +; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: sarl %cl, %edi -; i686-NEXT: movl %edx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %ebx -; i686-NEXT: sarl %cl, %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
i686-NEXT: movl %ebp, 28(%ecx) -; i686-NEXT: movl (%esp), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 24(%ecx) -; i686-NEXT: movl %ebx, 16(%ecx) -; i686-NEXT: movl %edi, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 8(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %eax, 20(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 4(%ecx) -; i686-NEXT: addl $92, %esp +; i686-NEXT: shrdl %cl, %eax, %edi +; i686-NEXT: sarl %cl, %edx +; i686-NEXT: movl 72(%ebp), %eax +; i686-NEXT: movl %edx, 28(%eax) +; i686-NEXT: movl %esi, 24(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 20(%eax) +; i686-NEXT: movl %edi, 16(%eax) +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 12(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 8(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 4(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx @@ -546,112 +509,106 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-LABEL: test_shl_v2i128: ; i686: # %bb.0: # %entry ; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $100, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi -; i686-NEXT: movl {{[0-9]+}}(%esp), %edi -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx 
-; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: andl $-16, %esp +; i686-NEXT: subl $128, %esp +; i686-NEXT: movl 40(%ebp), %edi +; i686-NEXT: movl 24(%ebp), %eax +; i686-NEXT: movl 28(%ebp), %ecx +; i686-NEXT: movl 32(%ebp), %edx +; i686-NEXT: movl 20(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 16(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 12(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 8(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 36(%ebp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebp, %ecx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp +; i686-NEXT: movl %edi, %ebx +; i686-NEXT: shrl $3, %ebx +; i686-NEXT: andl $12, %ebx ; i686-NEXT: leal {{[0-9]+}}(%esp), %eax -; i686-NEXT: subl %ebp, %eax +; i686-NEXT: subl %ebx, %eax ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl 8(%eax), %edx -; i686-NEXT: movl %edx, (%esp) # 4-byte Spill -; i686-NEXT: andl $7, %ecx +; i686-NEXT: movl (%eax), %esi +; i686-NEXT: movl 4(%eax), %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%eax), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %edi, %ecx +; i686-NEXT: andl $31, %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shll %cl, %edx -; i686-NEXT: movl 4(%eax), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %esi -; 
i686-NEXT: notl %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %edx, %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl (%eax), %eax +; i686-NEXT: shldl %cl, %edx, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %edx +; i686-NEXT: movl 56(%ebp), %eax +; i686-NEXT: movl %eax, %edx ; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $15, %edx -; i686-NEXT: leal {{[0-9]+}}(%esp), %esi -; i686-NEXT: subl %edx, %esi +; i686-NEXT: andl $12, %edx +; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx +; i686-NEXT: subl %edx, %ecx ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: andl $7, %ebx -; i686-NEXT: movl 8(%esi), %edi +; i686-NEXT: movl (%ecx), %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl 4(%esi), %eax +; i686-NEXT: movl 4(%ecx), %edi +; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%ecx), %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: andl $31, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %eax -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: notl %ecx +; i686-NEXT: movl %ecx, %eax +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, %eax -; i686-NEXT: orl %edi, %eax -; i686-NEXT: movl (%esi), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi +; i686-NEXT: shldl %cl, %edi, %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %esi, %eax ; 
i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shll %cl, %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: negl %ebp -; i686-NEXT: movl 64(%esp,%ebp), %esi -; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: movl (%esp), %edi # 4-byte Reload -; i686-NEXT: shldl %cl, %edi, %esi -; i686-NEXT: movl %esi, (%esp) # 4-byte Spill +; i686-NEXT: negl %ebx +; i686-NEXT: movl 76(%esp,%ebx), %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shldl %cl, %esi, %ebp +; i686-NEXT: shldl %cl, %esi, %ebx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; i686-NEXT: movl %edi, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shll %cl, %esi +; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %edx -; i686-NEXT: movl 96(%esp,%edx), %edx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shldl %cl, %ebx, %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %edx, 28(%ecx) -; i686-NEXT: movl %ebp, 20(%ecx) -; i686-NEXT: movl %edi, 16(%ecx) -; i686-NEXT: movl (%esp), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 4(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %eax, 24(%ecx) +; i686-NEXT: movl 108(%esp,%edx), %edx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 8(%ecx) -; 
i686-NEXT: addl $100, %esp +; i686-NEXT: shldl %cl, %eax, %edx +; i686-NEXT: movl 72(%ebp), %eax +; i686-NEXT: movl %edx, 28(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 24(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 20(%eax) +; i686-NEXT: movl %esi, 16(%eax) +; i686-NEXT: movl %ebx, 12(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 8(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 4(%eax) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index e1466aebf422..128e2199fb56 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -8,98 +8,78 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-LABEL: shift1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $92, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $112, %esp +; CHECK-NEXT: movl 
40(%ebp), %ecx +; CHECK-NEXT: movl 8(%ebp), %eax +; CHECK-NEXT: movl 12(%ebp), %edx +; CHECK-NEXT: movl 16(%ebp), %esi +; CHECK-NEXT: movl 32(%ebp), %edi +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl 28(%ebp), %edi +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl 24(%ebp), %edi ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl 20(%ebp), %edi +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl 36(%ebp), %edi +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: sarl $31, %esi -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: andb $7, %al -; CHECK-NEXT: shrb $3, %cl -; CHECK-NEXT: movzbl %cl, %ebp -; CHECK-NEXT: movl 32(%esp,%ebp), %esi +; CHECK-NEXT: shrb $5, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movl 40(%esp,%eax,4), %edx +; CHECK-NEXT: movl 36(%esp,%eax,4), %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl %cl, %esi -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: notb %dl -; CHECK-NEXT: movl 36(%esp,%ebp), %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: leal 
(%ecx,%ecx), %edi -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: orl %esi, %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 40(%esp,%ebp), %esi +; CHECK-NEXT: shrdl %cl, %edx, %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl %cl, %esi -; CHECK-NEXT: movl 44(%esp,%ebp), %ecx -; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill -; CHECK-NEXT: leal (%ecx,%ecx), %edi -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl 44(%esp,%eax,4), %esi +; CHECK-NEXT: shrdl %cl, %esi, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 48(%esp,%eax,4), %ebx +; CHECK-NEXT: shrdl %cl, %ebx, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 52(%esp,%eax,4), %esi +; CHECK-NEXT: shrdl %cl, %esi, %ebx +; CHECK-NEXT: movl 56(%esp,%eax,4), %edx +; CHECK-NEXT: shrdl %cl, %edx, %esi +; CHECK-NEXT: movl 32(%esp,%eax,4), %edi ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 48(%esp,%ebp), %ebx -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl %cl, %ebx -; CHECK-NEXT: movl 52(%esp,%ebp), %edi -; CHECK-NEXT: leal (%edi,%edi), %esi -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: orl %ebx, %esi -; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; CHECK-NEXT: movl 28(%esp,%ebp), %edx -; CHECK-NEXT: movl 56(%esp,%ebp), %ebx -; CHECK-NEXT: shrdl %cl, %ebx, %edi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; CHECK-NEXT: shrdl 
%cl, %ebp, %edx -; CHECK-NEXT: sarl %cl, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %ebx, 28(%eax) -; CHECK-NEXT: movl %edi, 24(%eax) -; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 16(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: movl %edx, (%eax) -; CHECK-NEXT: movl %esi, 20(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 12(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 4(%eax) -; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: movl 60(%esp,%eax,4), %eax +; CHECK-NEXT: shrdl %cl, %eax, %edx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %eax +; CHECK-NEXT: movl 72(%ebp), %ecx +; CHECK-NEXT: movl %eax, 28(%ecx) +; CHECK-NEXT: movl %edx, 24(%ecx) +; CHECK-NEXT: movl %esi, 20(%ecx) +; CHECK-NEXT: movl %ebx, 16(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 12(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 8(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 4(%ecx) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, (%ecx) +; CHECK-NEXT: leal -12(%ebp), %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx @@ -120,42 +100,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-X64-O0-NEXT: movb %r8b, %dl -; CHECK-X64-O0-NEXT: movb %dl, %cl -; CHECK-X64-O0-NEXT: andb $7, %cl +; 
CHECK-X64-O0-NEXT: movb %r8b, %cl ; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-X64-O0-NEXT: shrb $3, %dl +; CHECK-X64-O0-NEXT: movb %cl, %dl +; CHECK-X64-O0-NEXT: shrb $6, %dl ; CHECK-X64-O0-NEXT: movzbl %dl, %edx ; CHECK-X64-O0-NEXT: movl %edx, %edi -; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx -; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8 -; CHECK-X64-O0-NEXT: movq %r8, %r9 -; CHECK-X64-O0-NEXT: shrq %cl, %r9 -; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: notb %cl -; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi -; CHECK-X64-O0-NEXT: movq %rsi, %r10 -; CHECK-X64-O0-NEXT: addq %r10, %r10 -; CHECK-X64-O0-NEXT: shlq %cl, %r10 +; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %rsi +; CHECK-X64-O0-NEXT: movq -72(%rsp,%rdi,8), %r8 +; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r9 +; CHECK-X64-O0-NEXT: movq %r9, %rdx +; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: orq %r10, %r9 -; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi +; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rdi ; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx +; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8 ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-X64-O0-NEXT: sarq %cl, %rdi ; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax) ; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax) -; CHECK-X64-O0-NEXT: movq %rdx, (%rax) -; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax) +; CHECK-X64-O0-NEXT: movq %rdx, 8(%rax) +; CHECK-X64-O0-NEXT: movq %rcx, (%rax) ; CHECK-X64-O0-NEXT: retq ; ; CHECK-X64-O2-LABEL: shift1: ; CHECK-X64-O2: # 
%bb.0: # %entry -; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) @@ -165,29 +138,23 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movl %r8d, %eax -; CHECK-X64-O2-NEXT: andb $7, %al -; CHECK-X64-O2-NEXT: shrb $3, %r8b -; CHECK-X64-O2-NEXT: movzbl %r8b, %edx -; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi -; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi -; CHECK-X64-O2-NEXT: movq %rdi, %r8 -; CHECK-X64-O2-NEXT: movl %eax, %ecx -; CHECK-X64-O2-NEXT: shrq %cl, %r8 -; CHECK-X64-O2-NEXT: notb %cl -; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10 -; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11 -; CHECK-X64-O2-NEXT: shlq %cl, %r11 -; CHECK-X64-O2-NEXT: orq %r8, %r11 -; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx -; CHECK-X64-O2-NEXT: movl %eax, %ecx -; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10 -; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O2-NEXT: movl %r8d, %ecx +; CHECK-X64-O2-NEXT: shrb $6, %cl +; CHECK-X64-O2-NEXT: movzbl %cl, %edx +; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %rsi +; CHECK-X64-O2-NEXT: movq -72(%rsp,%rdx,8), %rdi +; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %r9 +; CHECK-X64-O2-NEXT: movq %r9, %r10 +; CHECK-X64-O2-NEXT: movl %r8d, %ecx +; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10 +; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rdx +; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi +; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi ; CHECK-X64-O2-NEXT: sarq %cl, %rdx -; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9) -; CHECK-X64-O2-NEXT: movq %r10, 16(%r9) -; CHECK-X64-O2-NEXT: movq %rsi, (%r9) -; CHECK-X64-O2-NEXT: movq %r11, 8(%r9) +; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax) +; 
CHECK-X64-O2-NEXT: movq %rsi, 16(%rax) +; CHECK-X64-O2-NEXT: movq %r10, 8(%rax) +; CHECK-X64-O2-NEXT: movq %rdi, (%rax) ; CHECK-X64-O2-NEXT: retq entry: %0 = ashr i256 %x, %a @@ -199,11 +166,13 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-LABEL: shift2: ; CHECK: # %bb.0: ; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $92, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $112, %esp +; CHECK-NEXT: movl 12(%ebp), %ecx ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -220,68 +189,54 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movb %al, %ch -; CHECK-NEXT: andb $7, %ch +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrb $3, %al +; CHECK-NEXT: andb $28, %al ; CHECK-NEXT: negb %al ; CHECK-NEXT: movsbl %al, %eax -; CHECK-NEXT: movl 68(%esp,%eax), %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movb %ch, %cl -; CHECK-NEXT: shll %cl, %edx -; CHECK-NEXT: notb %cl -; CHECK-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; CHECK-NEXT: movl 64(%esp,%eax), %ebp -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shrl %ebp -; CHECK-NEXT: shrl %cl, %ebp -; CHECK-NEXT: orl %edx, %ebp -; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 76(%esp,%eax), %edx -; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movb %ch, %cl -; CHECK-NEXT: shll %cl, %edx -; CHECK-NEXT: movl 72(%esp,%eax), %ebx -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: shrl %ebx -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; CHECK-NEXT: shrl %cl, %ebx -; CHECK-NEXT: orl 
%edx, %ebx -; CHECK-NEXT: movl 84(%esp,%eax), %esi +; CHECK-NEXT: movl 68(%esp,%eax), %esi ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movb %ch, %cl -; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl 80(%esp,%eax), %edi -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: shrl %edx -; CHECK-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; CHECK-NEXT: shrl %cl, %edx -; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: movb %ch, %cl -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; CHECK-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: movl 72(%esp,%eax), %edx +; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: shldl %cl, %esi, %edi -; CHECK-NEXT: movl 60(%esp,%eax), %ebp -; CHECK-NEXT: movl 88(%esp,%eax), %esi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: shldl %cl, %eax, %esi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 76(%esp,%eax), %esi +; CHECK-NEXT: movl %esi, %edi +; CHECK-NEXT: shldl %cl, %edx, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 80(%esp,%eax), %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: shldl %cl, %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 84(%esp,%eax), %esi +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: shldl %cl, %edx, %ebx +; CHECK-NEXT: movl 88(%esp,%eax), %edi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: shldl %cl, %esi, %edx +; CHECK-NEXT: movl 64(%esp,%eax), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 92(%esp,%eax), %esi +; CHECK-NEXT: shldl %cl, %edi, %esi +; CHECK-NEXT: movl 8(%ebp), %eax ; CHECK-NEXT: movl %esi, 28(%eax) -; CHECK-NEXT: movl %edi, 20(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; 
CHECK-NEXT: movl %esi, 12(%eax) -; CHECK-NEXT: movl %ebp, %esi -; CHECK-NEXT: shll %cl, %esi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; CHECK-NEXT: shldl %cl, %ebp, %edi -; CHECK-NEXT: movl %edi, 4(%eax) -; CHECK-NEXT: movl %esi, (%eax) ; CHECK-NEXT: movl %edx, 24(%eax) -; CHECK-NEXT: movl %ebx, 16(%eax) -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movl %ecx, 8(%eax) -; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: movl %ebx, 20(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, 16(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, 12(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: movl %edx, 8(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: shll %cl, %edx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; CHECK-NEXT: shldl %cl, %edi, %esi +; CHECK-NEXT: movl %esi, 4(%eax) +; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: leal -12(%ebp), %esp ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %edi ; CHECK-NEXT: popl %ebx @@ -299,77 +254,64 @@ define i256 @shift2(i256 %c) nounwind ; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-X64-O0-NEXT: movb %sil, %dl -; CHECK-X64-O0-NEXT: movb %dl, %cl -; CHECK-X64-O0-NEXT: andb $7, %cl +; CHECK-X64-O0-NEXT: movb %sil, %cl ; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-X64-O0-NEXT: movb %cl, %dl ; CHECK-X64-O0-NEXT: shrb $3, %dl +; CHECK-X64-O0-NEXT: andb $24, %dl ; CHECK-X64-O0-NEXT: negb %dl -; CHECK-X64-O0-NEXT: movsbq %dl, %rdx -; CHECK-X64-O0-NEXT: movq -16(%rsp,%rdx), %rsi -; CHECK-X64-O0-NEXT: movq %rsi, %r10 -; CHECK-X64-O0-NEXT: shlq %cl, %r10 +; 
CHECK-X64-O0-NEXT: movsbq %dl, %r8 +; CHECK-X64-O0-NEXT: movq -40(%rsp,%r8), %r9 +; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %rdx +; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %r10 +; CHECK-X64-O0-NEXT: movq %r10, %rsi +; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: notb %cl -; CHECK-X64-O0-NEXT: movq -32(%rsp,%rdx), %r9 -; CHECK-X64-O0-NEXT: movq -24(%rsp,%rdx), %r8 -; CHECK-X64-O0-NEXT: movq %r8, %r11 -; CHECK-X64-O0-NEXT: shrq %r11 -; CHECK-X64-O0-NEXT: shrq %cl, %r11 -; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: orq %r11, %r10 -; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-X64-O0-NEXT: movq -8(%rsp,%rdx), %rdx -; CHECK-X64-O0-NEXT: shldq %cl, %rsi, %rdx +; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r8 +; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8 ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: movq %r9, %rsi -; CHECK-X64-O0-NEXT: shlq %cl, %rsi +; CHECK-X64-O0-NEXT: movq %r9, %r10 +; CHECK-X64-O0-NEXT: shlq %cl, %r10 ; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload -; CHECK-X64-O0-NEXT: shldq %cl, %r9, %r8 +; CHECK-X64-O0-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-X64-O0-NEXT: shldq %cl, %r9, %rdx ; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; CHECK-X64-O0-NEXT: movq %r8, 8(%rdi) -; CHECK-X64-O0-NEXT: movq %rsi, (%rdi) -; CHECK-X64-O0-NEXT: movq %rdx, 24(%rdi) -; CHECK-X64-O0-NEXT: movq %rcx, 16(%rdi) +; CHECK-X64-O0-NEXT: movq %r8, 24(%rdi) +; CHECK-X64-O0-NEXT: movq %rsi, 16(%rdi) +; CHECK-X64-O0-NEXT: movq %rdx, 8(%rdi) +; CHECK-X64-O0-NEXT: movq %rcx, (%rdi) ; CHECK-X64-O0-NEXT: retq ; ; CHECK-X64-O2-LABEL: shift2: ; CHECK-X64-O2: # %bb.0: +; CHECK-X64-O2-NEXT: movq %rsi, %rcx ; CHECK-X64-O2-NEXT: movq %rdi, %rax -; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; 
CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0 +; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; CHECK-X64-O2-NEXT: movl %esi, %edx -; CHECK-X64-O2-NEXT: andb $7, %dl -; CHECK-X64-O2-NEXT: shrb $3, %sil -; CHECK-X64-O2-NEXT: negb %sil -; CHECK-X64-O2-NEXT: movsbq %sil, %rsi -; CHECK-X64-O2-NEXT: movq -16(%rsp,%rsi), %rdi -; CHECK-X64-O2-NEXT: movq %rdi, %r8 -; CHECK-X64-O2-NEXT: movl %edx, %ecx +; CHECK-X64-O2-NEXT: movl %ecx, %edx +; CHECK-X64-O2-NEXT: shrb $3, %dl +; CHECK-X64-O2-NEXT: andb $24, %dl +; CHECK-X64-O2-NEXT: negb %dl +; CHECK-X64-O2-NEXT: movsbq %dl, %rdx +; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi +; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rdi +; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %r8 +; CHECK-X64-O2-NEXT: movq %r8, %r9 +; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9 +; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx +; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx +; CHECK-X64-O2-NEXT: movq %rsi, %r8 ; CHECK-X64-O2-NEXT: shlq %cl, %r8 -; CHECK-X64-O2-NEXT: notb %cl -; CHECK-X64-O2-NEXT: movq -32(%rsp,%rsi), %r9 -; CHECK-X64-O2-NEXT: movq -24(%rsp,%rsi), %r10 -; CHECK-X64-O2-NEXT: movq %r10, %r11 -; CHECK-X64-O2-NEXT: shrq %r11 -; CHECK-X64-O2-NEXT: shrq %cl, %r11 -; CHECK-X64-O2-NEXT: orq %r8, %r11 -; CHECK-X64-O2-NEXT: movq -8(%rsp,%rsi), %rsi -; CHECK-X64-O2-NEXT: movl %edx, %ecx -; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %rsi -; CHECK-X64-O2-NEXT: movq %r9, %rdi -; CHECK-X64-O2-NEXT: shlq %cl, %rdi -; CHECK-X64-O2-NEXT: shldq %cl, %r9, %r10 -; CHECK-X64-O2-NEXT: movq %rsi, 24(%rax) -; CHECK-X64-O2-NEXT: movq %r10, 
8(%rax) -; CHECK-X64-O2-NEXT: movq %rdi, (%rax) -; CHECK-X64-O2-NEXT: movq %r11, 16(%rax) +; CHECK-X64-O2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-X64-O2-NEXT: shldq %cl, %rsi, %rdi +; CHECK-X64-O2-NEXT: movq %rdx, 24(%rax) +; CHECK-X64-O2-NEXT: movq %r9, 16(%rax) +; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax) +; CHECK-X64-O2-NEXT: movq %r8, (%rax) ; CHECK-X64-O2-NEXT: retq { %b = shl i256 1, %c ; %c must not be a constant diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index e5affd86312e..277525796824 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -646,7 +646,869 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; X86-SSE2-LABEL: lshr_16bytes: +; FALLBACK16-LABEL: lshr_16bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $60, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl (%ecx), %edx +; FALLBACK16-NEXT: movl 4(%ecx), %esi +; FALLBACK16-NEXT: movl 8(%ecx), %edi +; FALLBACK16-NEXT: movl 12(%ecx), %ecx +; FALLBACK16-NEXT: movb (%eax), %ah +; FALLBACK16-NEXT: movb %ah, %al +; FALLBACK16-NEXT: shlb $3, %al +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $12, %ah +; FALLBACK16-NEXT: movzbl %ah, %ebp +; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi +; 
FALLBACK16-NEXT: movl %esi, %ebx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl %eax, %edx +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %ebx, %esi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl %ebx, 12(%edx) +; FALLBACK16-NEXT: movl %ebp, 8(%edx) +; FALLBACK16-NEXT: movl %esi, (%edx) +; FALLBACK16-NEXT: movl %edi, 4(%edx) +; FALLBACK16-NEXT: addl $60, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: lshr_16bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $44, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK17-NEXT: movl (%edx), %esi +; FALLBACK17-NEXT: movl 4(%edx), %edi +; FALLBACK17-NEXT: movl 8(%edx), %ebx +; FALLBACK17-NEXT: 
movl 12(%edx), %edx +; FALLBACK17-NEXT: movb (%ecx), %ch +; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, (%esp) +; FALLBACK17-NEXT: andb $12, %ch +; FALLBACK17-NEXT: movzbl %ch, %ebx +; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi +; FALLBACK17-NEXT: movl (%esp,%ebx), %edx +; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp +; FALLBACK17-NEXT: movl %ebp, %edi +; FALLBACK17-NEXT: shrdl %cl, %esi, %edi +; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx +; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx +; FALLBACK17-NEXT: shrl %cl, %ebx +; FALLBACK17-NEXT: movl %esi, 8(%eax) +; FALLBACK17-NEXT: movl %ebx, 12(%eax) +; FALLBACK17-NEXT: movl %edx, (%eax) +; FALLBACK17-NEXT: movl %edi, 4(%eax) +; FALLBACK17-NEXT: addl $44, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: lshr_16bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $44, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl (%ecx), %edx +; FALLBACK18-NEXT: movl 4(%ecx), %esi +; FALLBACK18-NEXT: movl 8(%ecx), %edi +; FALLBACK18-NEXT: movl 12(%ecx), %ecx +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
FALLBACK18-NEXT: movl %edx, (%esp) +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx +; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp +; FALLBACK18-NEXT: movl %eax, %edx +; FALLBACK18-NEXT: notb %dl +; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ebp, %edi +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %eax +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %edx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK18-NEXT: movl %eax, 12(%esi) +; FALLBACK18-NEXT: movl %edx, 8(%esi) +; FALLBACK18-NEXT: movl %edi, (%esi) +; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: addl $44, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: lshr_16bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $44, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK19-NEXT: movl (%edx), %esi +; FALLBACK19-NEXT: movl 4(%edx), %edi +; FALLBACK19-NEXT: movl 8(%edx), %ebx +; FALLBACK19-NEXT: movl 12(%edx), %edx +; FALLBACK19-NEXT: movzbl (%ecx), %eax +; FALLBACK19-NEXT: movl %eax, %ecx +; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, (%esp) +; FALLBACK19-NEXT: andb $12, %al +; FALLBACK19-NEXT: movzbl %al, %eax +; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx +; FALLBACK19-NEXT: movl (%esp,%eax), %edx +; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi +; FALLBACK19-NEXT: movl %esi, %edi +; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi +; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax +; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK19-NEXT: movl %ebx, 8(%ebp) +; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: shrdl %cl, %esi, %edx +; FALLBACK19-NEXT: movl %edx, (%ebp) +; FALLBACK19-NEXT: movl %edi, 4(%ebp) +; FALLBACK19-NEXT: addl $44, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: lshr_16bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $60, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movzbl (%eax), %ecx +; FALLBACK20-NEXT: movl %ecx, %eax +; FALLBACK20-NEXT: shlb $3, %al +; FALLBACK20-NEXT: xorps %xmm1, %xmm1 +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: andb $12, %cl +; FALLBACK20-NEXT: movzbl %cl, %edi +; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: addl %esi, %esi +; 
FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, %esi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %esi, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %esi, %ebx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl %edi, 12(%edx) +; FALLBACK20-NEXT: movl %ebx, 4(%edx) +; FALLBACK20-NEXT: movl %ebp, 8(%edx) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, (%edx) +; FALLBACK20-NEXT: addl $60, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: lshr_16bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $44, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK21-NEXT: movups (%edx), %xmm0 +; FALLBACK21-NEXT: movzbl (%ecx), %edx +; FALLBACK21-NEXT: movl %edx, %ecx +; FALLBACK21-NEXT: shlb $3, %cl +; FALLBACK21-NEXT: xorps %xmm1, %xmm1 +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, (%esp) +; FALLBACK21-NEXT: andb $12, %dl +; FALLBACK21-NEXT: movzbl %dl, 
%ebx +; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK21-NEXT: movl %ebp, %edi +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl (%esp,%ebx), %esi +; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK21-NEXT: movl %eax, %ebx +; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %ebx, 4(%ebp) +; FALLBACK21-NEXT: movl %edi, 8(%ebp) +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: shrl %cl, %edx +; FALLBACK21-NEXT: movl %edx, 12(%ebp) +; FALLBACK21-NEXT: movl %esi, (%ebp) +; FALLBACK21-NEXT: addl $44, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: lshr_16bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $44, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movzbl (%eax), %ecx +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: xorps %xmm1, %xmm1 +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, (%esp) +; FALLBACK22-NEXT: andb $12, %cl +; FALLBACK22-NEXT: movzbl %cl, %edi +; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp +; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK22-NEXT: orl %ebx, %edx +; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK22-NEXT: shrxl %eax, %edi, %eax +; FALLBACK22-NEXT: addl %edi, 
%edi +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebx, %edi +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %edi, 8(%esi) +; FALLBACK22-NEXT: movl %edx, (%esi) +; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: lshr_16bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $44, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK23-NEXT: movups (%edx), %xmm0 +; FALLBACK23-NEXT: movzbl (%ecx), %edx +; FALLBACK23-NEXT: movl %edx, %ecx +; FALLBACK23-NEXT: shlb $3, %cl +; FALLBACK23-NEXT: xorps %xmm1, %xmm1 +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, (%esp) +; FALLBACK23-NEXT: andb $12, %dl +; FALLBACK23-NEXT: movzbl %dl, %ebx +; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK23-NEXT: movl %ebp, %edi +; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl (%esp,%ebx), %esi +; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, %ebx +; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK23-NEXT: movl %ebx, 4(%ebp) +; FALLBACK23-NEXT: movl %edi, 8(%ebp) +; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx +; FALLBACK23-NEXT: movl %edx, 12(%ebp) +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl %esi, (%ebp) +; FALLBACK23-NEXT: addl $44, %esp +; FALLBACK23-NEXT: popl %esi +; 
FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: lshr_16bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $60, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK24-NEXT: movzbl (%eax), %ecx +; FALLBACK24-NEXT: movl %ecx, %eax +; FALLBACK24-NEXT: shlb $3, %al +; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: andb $12, %cl +; FALLBACK24-NEXT: movzbl %cl, %edi +; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, %esi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %esi, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %esi, %ebx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl %edi, 12(%edx) +; FALLBACK24-NEXT: movl %ebx, 4(%edx) +; FALLBACK24-NEXT: movl %ebp, 8(%edx) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, (%edx) +; FALLBACK24-NEXT: addl $60, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: lshr_16bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $44, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK25-NEXT: vmovups (%edx), %xmm0 +; FALLBACK25-NEXT: movzbl (%ecx), %edx +; FALLBACK25-NEXT: movl %edx, %ecx +; FALLBACK25-NEXT: shlb $3, %cl +; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK25-NEXT: andb $12, %dl +; FALLBACK25-NEXT: movzbl %dl, %ebx +; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK25-NEXT: movl %ebp, %edi +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl (%esp,%ebx), %esi +; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK25-NEXT: movl %eax, %ebx +; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %ebx, 4(%ebp) +; FALLBACK25-NEXT: movl %edi, 8(%ebp) +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: shrl %cl, %edx +; FALLBACK25-NEXT: movl %edx, 12(%ebp) +; FALLBACK25-NEXT: movl %esi, (%ebp) +; FALLBACK25-NEXT: addl $44, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: lshr_16bytes: +; 
FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK26-NEXT: movzbl (%eax), %ecx +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: shlb $3, %al +; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK26-NEXT: andb $12, %cl +; FALLBACK26-NEXT: movzbl %cl, %edi +; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp +; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK26-NEXT: orl %ebx, %edx +; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %eax +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: orl %ebx, %edi +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %edi, 8(%esi) +; FALLBACK26-NEXT: movl %edx, (%esi) +; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: lshr_16bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $44, %esp +; FALLBACK27-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK27-NEXT: vmovups (%edx), %xmm0 +; FALLBACK27-NEXT: movzbl (%ecx), %edx +; FALLBACK27-NEXT: movl %edx, %ecx +; FALLBACK27-NEXT: shlb $3, %cl +; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK27-NEXT: andb $12, %dl +; FALLBACK27-NEXT: movzbl %dl, %ebx +; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK27-NEXT: movl %ebp, %edi +; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl (%esp,%ebx), %esi +; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, %ebx +; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK27-NEXT: movl %ebx, 4(%ebp) +; FALLBACK27-NEXT: movl %edi, 8(%ebp) +; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx +; FALLBACK27-NEXT: movl %edx, 12(%ebp) +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl %esi, (%ebp) +; FALLBACK27-NEXT: addl $44, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: lshr_16bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $60, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK28-NEXT: movzbl (%eax), %ecx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: shlb $3, %al +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: andb $12, %cl +; FALLBACK28-NEXT: movzbl %cl, %edi +; 
FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, %esi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %esi, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %esi, %ebx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl %edi, 12(%edx) +; FALLBACK28-NEXT: movl %ebx, 4(%edx) +; FALLBACK28-NEXT: movl %ebp, 8(%edx) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, (%edx) +; FALLBACK28-NEXT: addl $60, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: lshr_16bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $44, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
FALLBACK29-NEXT: vmovups (%edx), %xmm0 +; FALLBACK29-NEXT: movzbl (%ecx), %edx +; FALLBACK29-NEXT: movl %edx, %ecx +; FALLBACK29-NEXT: shlb $3, %cl +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK29-NEXT: andb $12, %dl +; FALLBACK29-NEXT: movzbl %dl, %ebx +; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK29-NEXT: movl %ebp, %edi +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl (%esp,%ebx), %esi +; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK29-NEXT: movl %eax, %ebx +; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %ebx, 4(%ebp) +; FALLBACK29-NEXT: movl %edi, 8(%ebp) +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: shrl %cl, %edx +; FALLBACK29-NEXT: movl %edx, 12(%ebp) +; FALLBACK29-NEXT: movl %esi, (%ebp) +; FALLBACK29-NEXT: addl $44, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: lshr_16bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK30-NEXT: movzbl (%eax), %ecx +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: shlb $3, %al +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK30-NEXT: andb $12, %cl +; FALLBACK30-NEXT: movzbl %cl, %edi +; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp +; 
FALLBACK30-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK30-NEXT: orl %ebx, %edx +; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %eax +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: orl %ebx, %edi +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %edi, 8(%esi) +; FALLBACK30-NEXT: movl %edx, (%esi) +; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: lshr_16bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $44, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK31-NEXT: vmovups (%edx), %xmm0 +; FALLBACK31-NEXT: movzbl (%ecx), %edx +; FALLBACK31-NEXT: movl %edx, %ecx +; FALLBACK31-NEXT: shlb $3, %cl +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovaps %xmm0, (%esp) +; FALLBACK31-NEXT: andb $12, %dl +; FALLBACK31-NEXT: movzbl %dl, %ebx +; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp +; FALLBACK31-NEXT: movl %ebp, %edi +; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl (%esp,%ebx), %esi +; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, %ebx +; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx +; FALLBACK31-NEXT: movl 
{{[0-9]+}}(%esp), %ebp +; FALLBACK31-NEXT: movl %ebx, 4(%ebp) +; FALLBACK31-NEXT: movl %edi, 8(%ebp) +; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx +; FALLBACK31-NEXT: movl %edx, 12(%ebp) +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl %esi, (%ebp) +; FALLBACK31-NEXT: addl $44, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: retl + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = lshr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax +; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al +; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx +; 
X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi +; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi +; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil +; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi +; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax +; 
X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq +; +; X86-SSE2-LABEL: lshr_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi @@ -660,19 +1522,17 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 8(%edx), %ebx ; X86-SSE2-NEXT: movl 12(%edx), %edx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, (%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $15, %ecx -; X86-SSE2-NEXT: movl (%esp,%ecx), %edx -; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx +; X86-SSE2-NEXT: andl $3, %ecx +; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx +; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi +; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi +; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %edi, 12(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) @@ -683,46 +1543,47 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: lshr_16bytes: +; X86-SSE42-LABEL: lshr_16bytes_dwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $32, %esp +; X86-SSE42-NEXT: subl $44, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movzbl 
(%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm1, %xmm1 -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, (%esp) -; X86-SSE42-NEXT: andl $15, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) +; X86-SSE42-NEXT: andl $3, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $32, %esp +; X86-SSE42-NEXT: addl $44, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: lshr_16bytes: +; X86-AVX-LABEL: lshr_16bytes_dwordOff: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $32, %esp +; X86-AVX-NEXT: subl $44, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm0, (%esp) -; X86-AVX-NEXT: andl $15, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 +; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-AVX-NEXT: andl $3, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $32, %esp +; X86-AVX-NEXT: addl $44, %esp ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr %byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %dwordOff = load i128, ptr %dwordOff.ptr, align 1 + %bitOff = shl i128 %dwordOff, 5 %res = lshr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } + define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes: ; X64-NO-SHLD-NO-BMI2: # %bb.0: @@ -800,7 +1661,877 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; 
X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; X86-SSE2-LABEL: shl_16bytes: +; FALLBACK16-LABEL: shl_16bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $60, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl (%ecx), %ebx +; FALLBACK16-NEXT: movl 4(%ecx), %esi +; FALLBACK16-NEXT: movl 8(%ecx), %edi +; FALLBACK16-NEXT: movl 12(%ecx), %ecx +; FALLBACK16-NEXT: movb (%eax), %ah +; FALLBACK16-NEXT: movb %ah, %dh +; FALLBACK16-NEXT: shlb $3, %dh +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $12, %ah +; FALLBACK16-NEXT: negb %ah +; FALLBACK16-NEXT: movsbl %ah, %ebp +; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi +; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movb %dh, %dl +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: shrl %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi +; FALLBACK16-NEXT: movl %edi, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; 
FALLBACK16-NEXT: orl %edi, %esi +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl %edx, (%eax) +; FALLBACK16-NEXT: movl %esi, 8(%eax) +; FALLBACK16-NEXT: movl %ebp, 12(%eax) +; FALLBACK16-NEXT: movl %ebx, 4(%eax) +; FALLBACK16-NEXT: addl $60, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: shl_16bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $32, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK17-NEXT: movl (%edx), %esi +; FALLBACK17-NEXT: movl 4(%edx), %edi +; FALLBACK17-NEXT: movl 8(%edx), %ebx +; FALLBACK17-NEXT: movl 12(%edx), %edx +; FALLBACK17-NEXT: movb (%ecx), %ch +; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, (%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: andb $12, %ch +; FALLBACK17-NEXT: negb %ch +; FALLBACK17-NEXT: movsbl %ch, %edi +; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK17-NEXT: shldl %cl, %edi, %esi +; FALLBACK17-NEXT: shldl %cl, %ebx, %edi +; FALLBACK17-NEXT: shll %cl, %ebx +; FALLBACK17-NEXT: movl %esi, 8(%eax) +; FALLBACK17-NEXT: movl %edx, 12(%eax) +; FALLBACK17-NEXT: movl %ebx, (%eax) +; FALLBACK17-NEXT: 
movl %edi, 4(%eax) +; FALLBACK17-NEXT: addl $32, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: shl_16bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $44, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl (%ecx), %edx +; FALLBACK18-NEXT: movl 4(%ecx), %esi +; FALLBACK18-NEXT: movl 8(%ecx), %edi +; FALLBACK18-NEXT: movl 12(%ecx), %ecx +; FALLBACK18-NEXT: movzbl (%eax), %eax +; FALLBACK18-NEXT: movl %eax, %ebx +; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; FALLBACK18-NEXT: movaps %xmm0, (%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: andb $12, %al +; FALLBACK18-NEXT: negb %al +; FALLBACK18-NEXT: movsbl %al, %edx +; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi +; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx +; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: notb %al +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %eax, %edi, %edi +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi +; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx +; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shrl %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edx +; FALLBACK18-NEXT: orl %esi, %edx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl %ebp, (%ecx) +; FALLBACK18-NEXT: movl %eax, 8(%ecx) +; FALLBACK18-NEXT: movl %edx, 12(%ecx) +; 
FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: addl $44, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: shl_16bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $44, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK19-NEXT: movl (%edx), %esi +; FALLBACK19-NEXT: movl 4(%edx), %edi +; FALLBACK19-NEXT: movl 8(%edx), %ebx +; FALLBACK19-NEXT: movl 12(%edx), %edx +; FALLBACK19-NEXT: movzbl (%ecx), %eax +; FALLBACK19-NEXT: movl %eax, %ecx +; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, (%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: andb $12, %al +; FALLBACK19-NEXT: negb %al +; FALLBACK19-NEXT: movsbl %al, %eax +; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi +; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx +; FALLBACK19-NEXT: shldl %cl, %esi, %edx +; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi +; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax +; FALLBACK19-NEXT: shldl %cl, %eax, %esi +; FALLBACK19-NEXT: shldl %cl, %edi, %eax +; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx +; FALLBACK19-NEXT: movl %esi, 8(%ebp) +; FALLBACK19-NEXT: movl %edx, 12(%ebp) +; FALLBACK19-NEXT: movl %ecx, (%ebp) +; FALLBACK19-NEXT: movl %eax, 4(%ebp) +; FALLBACK19-NEXT: addl $44, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: shl_16bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; 
FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $60, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movzbl (%eax), %ecx +; FALLBACK20-NEXT: movl %ecx, %eax +; FALLBACK20-NEXT: shlb $3, %al +; FALLBACK20-NEXT: xorps %xmm1, %xmm1 +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: andb $12, %cl +; FALLBACK20-NEXT: negb %cl +; FALLBACK20-NEXT: movsbl %cl, %edi +; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: movl %eax, (%edx) +; FALLBACK20-NEXT: movl %ebp, 4(%edx) +; FALLBACK20-NEXT: movl %edi, 8(%edx) 
+; FALLBACK20-NEXT: movl %esi, 12(%edx) +; FALLBACK20-NEXT: addl $60, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: shl_16bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $44, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK21-NEXT: movups (%edx), %xmm0 +; FALLBACK21-NEXT: movzbl (%ecx), %edx +; FALLBACK21-NEXT: movl %edx, %ecx +; FALLBACK21-NEXT: shlb $3, %cl +; FALLBACK21-NEXT: xorps %xmm1, %xmm1 +; FALLBACK21-NEXT: movaps %xmm1, (%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: andb $12, %dl +; FALLBACK21-NEXT: negb %dl +; FALLBACK21-NEXT: movsbl %dl, %edi +; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK21-NEXT: shldl %cl, %esi, %edx +; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK21-NEXT: shldl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %ebx, %ebp +; FALLBACK21-NEXT: shll %cl, %ebp +; FALLBACK21-NEXT: shldl %cl, %ebx, %edi +; FALLBACK21-NEXT: movl %edi, 4(%eax) +; FALLBACK21-NEXT: movl %esi, 8(%eax) +; FALLBACK21-NEXT: movl %edx, 12(%eax) +; FALLBACK21-NEXT: movl %ebp, (%eax) +; FALLBACK21-NEXT: addl $44, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: shl_16bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $44, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movzbl (%eax), %ecx +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: xorps %xmm1, %xmm1 +; FALLBACK22-NEXT: movaps %xmm1, (%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: andb $12, %cl +; FALLBACK22-NEXT: negb %cl +; FALLBACK22-NEXT: movsbl %cl, %ecx +; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx +; FALLBACK22-NEXT: shlxl %eax, %edx, %edi +; FALLBACK22-NEXT: movl %eax, %ebx +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK22-NEXT: orl %esi, %edx +; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shlxl %eax, %esi, %esi +; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx +; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: orl %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK22-NEXT: movl %eax, (%esi) +; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: shl_16bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $44, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK23-NEXT: movups (%edx), %xmm0 +; FALLBACK23-NEXT: movzbl (%ecx), %edx +; FALLBACK23-NEXT: movl %edx, %ecx +; 
FALLBACK23-NEXT: shlb $3, %cl +; FALLBACK23-NEXT: xorps %xmm1, %xmm1 +; FALLBACK23-NEXT: movaps %xmm1, (%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: andb $12, %dl +; FALLBACK23-NEXT: negb %dl +; FALLBACK23-NEXT: movsbl %dl, %edi +; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK23-NEXT: shldl %cl, %esi, %edx +; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK23-NEXT: shldl %cl, %edi, %esi +; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: shldl %cl, %ebx, %edi +; FALLBACK23-NEXT: movl %edi, 4(%eax) +; FALLBACK23-NEXT: movl %esi, 8(%eax) +; FALLBACK23-NEXT: movl %edx, 12(%eax) +; FALLBACK23-NEXT: movl %ebp, (%eax) +; FALLBACK23-NEXT: addl $44, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: shl_16bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $60, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK24-NEXT: movzbl (%eax), %ecx +; FALLBACK24-NEXT: movl %ecx, %eax +; FALLBACK24-NEXT: shlb $3, %al +; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: andb $12, %cl +; FALLBACK24-NEXT: negb %cl +; FALLBACK24-NEXT: movsbl %cl, %edi +; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %esi +; 
FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: movl %eax, (%edx) +; FALLBACK24-NEXT: movl %ebp, 4(%edx) +; FALLBACK24-NEXT: movl %edi, 8(%edx) +; FALLBACK24-NEXT: movl %esi, 12(%edx) +; FALLBACK24-NEXT: addl $60, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: shl_16bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $44, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK25-NEXT: vmovups (%edx), %xmm0 +; FALLBACK25-NEXT: movzbl (%ecx), %edx +; FALLBACK25-NEXT: movl %edx, %ecx +; FALLBACK25-NEXT: shlb $3, %cl +; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK25-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) 
+; FALLBACK25-NEXT: andb $12, %dl +; FALLBACK25-NEXT: negb %dl +; FALLBACK25-NEXT: movsbl %dl, %edi +; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK25-NEXT: shldl %cl, %esi, %edx +; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK25-NEXT: shldl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %ebx, %ebp +; FALLBACK25-NEXT: shll %cl, %ebp +; FALLBACK25-NEXT: shldl %cl, %ebx, %edi +; FALLBACK25-NEXT: movl %edi, 4(%eax) +; FALLBACK25-NEXT: movl %esi, 8(%eax) +; FALLBACK25-NEXT: movl %edx, 12(%eax) +; FALLBACK25-NEXT: movl %ebp, (%eax) +; FALLBACK25-NEXT: addl $44, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: shl_16bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK26-NEXT: movzbl (%eax), %ecx +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: shlb $3, %al +; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: andb $12, %cl +; FALLBACK26-NEXT: negb %cl +; FALLBACK26-NEXT: movsbl %cl, %ecx +; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi +; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edi +; FALLBACK26-NEXT: movl %eax, %ebx +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK26-NEXT: orl %esi, %edx +; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi +; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp +; 
FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shlxl %eax, %esi, %esi +; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx +; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK26-NEXT: movl %eax, (%esi) +; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: shl_16bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $44, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK27-NEXT: vmovups (%edx), %xmm0 +; FALLBACK27-NEXT: movzbl (%ecx), %edx +; FALLBACK27-NEXT: movl %edx, %ecx +; FALLBACK27-NEXT: shlb $3, %cl +; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK27-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: andb $12, %dl +; FALLBACK27-NEXT: negb %dl +; FALLBACK27-NEXT: movsbl %dl, %edi +; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK27-NEXT: shldl %cl, %esi, %edx +; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK27-NEXT: shldl %cl, %edi, %esi +; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: shldl %cl, %ebx, %edi +; FALLBACK27-NEXT: movl %edi, 4(%eax) +; FALLBACK27-NEXT: movl %esi, 8(%eax) +; FALLBACK27-NEXT: movl %edx, 12(%eax) +; FALLBACK27-NEXT: movl %ebp, (%eax) +; FALLBACK27-NEXT: addl 
$44, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: shl_16bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $60, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK28-NEXT: movzbl (%eax), %ecx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: shlb $3, %al +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: andb $12, %cl +; FALLBACK28-NEXT: negb %cl +; FALLBACK28-NEXT: movsbl %cl, %edi +; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), 
%edx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: movl %eax, (%edx) +; FALLBACK28-NEXT: movl %ebp, 4(%edx) +; FALLBACK28-NEXT: movl %edi, 8(%edx) +; FALLBACK28-NEXT: movl %esi, 12(%edx) +; FALLBACK28-NEXT: addl $60, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: shl_16bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $44, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK29-NEXT: vmovups (%edx), %xmm0 +; FALLBACK29-NEXT: movzbl (%ecx), %edx +; FALLBACK29-NEXT: movl %edx, %ecx +; FALLBACK29-NEXT: shlb $3, %cl +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: andb $12, %dl +; FALLBACK29-NEXT: negb %dl +; FALLBACK29-NEXT: movsbl %dl, %edi +; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK29-NEXT: shldl %cl, %esi, %edx +; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK29-NEXT: shldl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %ebx, %ebp +; FALLBACK29-NEXT: shll %cl, %ebp +; FALLBACK29-NEXT: shldl %cl, %ebx, %edi +; FALLBACK29-NEXT: movl %edi, 4(%eax) +; FALLBACK29-NEXT: movl %esi, 8(%eax) +; FALLBACK29-NEXT: movl %edx, 12(%eax) +; FALLBACK29-NEXT: movl %ebp, (%eax) +; FALLBACK29-NEXT: addl $44, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: shl_16bytes: +; 
FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK30-NEXT: movzbl (%eax), %ecx +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: shlb $3, %al +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: andb $12, %cl +; FALLBACK30-NEXT: negb %cl +; FALLBACK30-NEXT: movsbl %cl, %ecx +; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edi +; FALLBACK30-NEXT: movl %eax, %ebx +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK30-NEXT: orl %esi, %edx +; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shlxl %eax, %esi, %esi +; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx +; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: orl %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK30-NEXT: movl %eax, (%esi) +; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: shl_16bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: 
subl $44, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK31-NEXT: vmovups (%edx), %xmm0 +; FALLBACK31-NEXT: movzbl (%ecx), %edx +; FALLBACK31-NEXT: movl %edx, %ecx +; FALLBACK31-NEXT: shlb $3, %cl +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovaps %xmm1, (%esp) +; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: andb $12, %dl +; FALLBACK31-NEXT: negb %dl +; FALLBACK31-NEXT: movsbl %dl, %edi +; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi +; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx +; FALLBACK31-NEXT: shldl %cl, %esi, %edx +; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx +; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi +; FALLBACK31-NEXT: shldl %cl, %edi, %esi +; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: shldl %cl, %ebx, %edi +; FALLBACK31-NEXT: movl %edi, 4(%eax) +; FALLBACK31-NEXT: movl %esi, 8(%eax) +; FALLBACK31-NEXT: movl %edx, 12(%eax) +; FALLBACK31-NEXT: movl %ebp, (%eax) +; FALLBACK31-NEXT: addl $44, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: retl + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = shl i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax +; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi +; 
X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl +; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al +; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax +; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi +; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8 +; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, %rax, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi +; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; 
X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq +; +; X86-SSE2-LABEL: shl_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi @@ -814,15 +2545,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 8(%edx), %ebx ; X86-SSE2-NEXT: movl 12(%edx), %edx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, (%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, (%esp) -; X86-SSE2-NEXT: andb $15, %cl +; X86-SSE2-NEXT: shlb $2, %cl +; X86-SSE2-NEXT: andb $12, %cl ; X86-SSE2-NEXT: negb %cl ; X86-SSE2-NEXT: movsbl %cl, %ecx ; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx @@ -839,50 +2569,53 @@ define void @shl_16bytes(ptr %src.ptr, ptr 
%byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: shl_16bytes: +; X86-SSE42-LABEL: shl_16bytes_dwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $32, %esp +; X86-SSE42-NEXT: subl $44, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm1, %xmm1 -; X86-SSE42-NEXT: movups %xmm1, (%esp) -; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andb $15, %cl +; X86-SSE42-NEXT: movaps %xmm1, (%esp) +; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: shlb $2, %cl +; X86-SSE42-NEXT: andb $12, %cl ; X86-SSE42-NEXT: negb %cl ; X86-SSE42-NEXT: movsbl %cl, %ecx ; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $32, %esp +; X86-SSE42-NEXT: addl $44, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: shl_16bytes: +; X86-AVX-LABEL: shl_16bytes_dwordOff: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $32, %esp +; X86-AVX-NEXT: subl $44, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, (%esp) -; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andb $15, %cl +; X86-AVX-NEXT: vmovaps %xmm1, (%esp) +; X86-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: shlb $2, %cl +; X86-AVX-NEXT: andb $12, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx ; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $32, %esp +; X86-AVX-NEXT: addl $44, %esp ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr 
%byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %dwordOff = load i128, ptr %dwordOff.ptr, align 1 + %bitOff = shl i128 %dwordOff, 5 %res = shl i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } + define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X64-NO-SHLD-NO-BMI2: # %bb.0: @@ -960,7 +2693,312 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; -; X86-SSE2-LABEL: ashr_16bytes: +; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes: +; X86-NO-SHLD-NO-BMI2: # %bb.0: +; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi +; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi +; X86-NO-SHLD-NO-BMI2-NEXT: subl $60, %esp +; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-NO-BMI2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: movb (%eax), %ah +; X86-NO-SHLD-NO-BMI2-NEXT: movb %ah, %al +; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %ah +; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 
%ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx +; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl +; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi +; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi +; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi +; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, 8(%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, (%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx) +; X86-NO-SHLD-NO-BMI2-NEXT: addl $60, %esp +; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi +; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi +; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx +; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebp +; X86-NO-SHLD-NO-BMI2-NEXT: retl +; +; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes: +; X86-HAVE-SHLD-NO-BMI2: # %bb.0: +; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebp +; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebx +; 
X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movb (%ecx), %ch +; X86-HAVE-SHLD-NO-BMI2-NEXT: movb %ch, %cl +; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %ch +; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %ch, %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%esp,%ebx), %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%ebx), %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%ebx), %ebp +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebp, %edx +; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 4(%eax) +; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-NO-BMI2-NEXT: popl 
%esi +; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi +; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebx +; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebp +; X86-HAVE-SHLD-NO-BMI2-NEXT: retl +; +; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: +; X86-NO-SHLD-HAVE-BMI2: # %bb.0: +; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: subl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp +; 
X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp +; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: retl +; +; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: +; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0: +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; 
X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %al +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%eax), %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%eax), %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%eax), %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%eax), %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 8(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %eax, %eax +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%ebp) +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebx +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebp +; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl + %src = load i128, ptr %src.ptr, align 1 + %byteOff = load i128, ptr %byteOff.ptr, align 1 + %bitOff = shl i128 %byteOff, 3 + %res = ashr i128 %src, %bitOff + store i128 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: +; X64-NO-SHLD-NO-BMI2: # %bb.0: +; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), 
%eax +; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl +; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8 +; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al +; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi +; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) +; X64-NO-SHLD-NO-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: +; X64-HAVE-SHLD-NO-BMI2: # %bb.0: +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi +; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax +; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) +; X64-HAVE-SHLD-NO-BMI2-NEXT: retq +; +; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff: +; X64-NO-SHLD-HAVE-BMI2: # %bb.0: +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi +; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil +; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 +; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, 
%rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi +; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi +; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) +; X64-NO-SHLD-HAVE-BMI2-NEXT: retq +; +; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff: +; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) +; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq +; +; X86-SSE2-LABEL: ashr_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi @@ -983,11 +3021,11 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $15, %ecx -; X86-SSE2-NEXT: movl (%esp,%ecx), %edx -; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx +; X86-SSE2-NEXT: andl $3, %ecx +; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx +; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi +; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi +; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; 
X86-SSE2-NEXT: movl %edi, 12(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) @@ -998,7 +3036,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: ashr_16bytes: +; X86-SSE42-LABEL: ashr_16bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi @@ -1021,8 +3059,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andl $15, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 +; X86-SSE42-NEXT: andl $3, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $32, %esp ; X86-SSE42-NEXT: popl %esi @@ -1030,7 +3068,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: ashr_16bytes: +; X86-AVX-LABEL: ashr_16bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi @@ -1053,8 +3091,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andl $15, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 +; X86-AVX-NEXT: andl $3, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $32, %esp ; X86-AVX-NEXT: popl %esi @@ -1062,84 +3100,2731 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 - %byteOff = load i128, ptr %byteOff.ptr, align 1 - %bitOff = shl i128 %byteOff, 3 + %dwordOff = load i128, ptr %dwordOff.ptr, align 1 + %bitOff = 
shl i128 %dwordOff, 5 %res = ashr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: lshr_32bytes: +; FALLBACK0-LABEL: lshr_32bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: leal (,%rsi,8), %eax +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: andb $24, %sil +; FALLBACK0-NEXT: movzbl %sil, %r9d +; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r11, %r8 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, 24(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: 
movq %rdi, (%rdx) +; FALLBACK0-NEXT: movq %r8, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: lshr_32bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: leal (,%rsi,8), %ecx +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: andb $24, %sil +; FALLBACK1-NEXT: movzbl %sil, %eax +; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK1-NEXT: movq %r8, %r9 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: shrq %cl, %rax +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rax, 24(%rdx) +; FALLBACK1-NEXT: movq %rdi, (%rdx) +; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: lshr_32bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi +; FALLBACK2-NEXT: leal (,%rsi,8), %eax +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq 
%rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: andb $24, %sil +; FALLBACK2-NEXT: movzbl %sil, %ecx +; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: addq %rcx, %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: lshr_32bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: leal (,%rsi,8), %ecx +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: andb $24, %sil +; FALLBACK3-NEXT: movzbl %sil, %eax +; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK3-NEXT: movq %r8, %r9 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi +; 
FALLBACK3-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rax, 24(%rdx) +; FALLBACK3-NEXT: movq %rdi, (%rdx) +; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: lshr_32bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movzbl (%rsi), %ecx +; FALLBACK4-NEXT: leal (,%rcx,8), %eax +; FALLBACK4-NEXT: xorps %xmm2, %xmm2 +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: andb $24, %cl +; FALLBACK4-NEXT: movzbl %cl, %r9d +; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rdi +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, 24(%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, (%rdx) +; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: retq +; +; 
FALLBACK5-LABEL: lshr_32bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movzbl (%rsi), %eax +; FALLBACK5-NEXT: leal (,%rax,8), %ecx +; FALLBACK5-NEXT: xorps %xmm2, %xmm2 +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: andb $24, %al +; FALLBACK5-NEXT: movzbl %al, %eax +; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK5-NEXT: movq %rdi, %r8 +; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: shrq %cl, %rsi +; FALLBACK5-NEXT: movq %r10, 8(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: lshr_32bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movzbl (%rsi), %ecx +; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: xorps %xmm2, %xmm2 +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: andb $24, %cl +; FALLBACK6-NEXT: movzbl %cl, %ecx +; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rax, %rcx, 
%r11 +; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r9, %rcx +; FALLBACK6-NEXT: addq %r8, %r8 +; FALLBACK6-NEXT: shlxq %rax, %r8, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: lshr_32bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movzbl (%rsi), %eax +; FALLBACK7-NEXT: leal (,%rax,8), %ecx +; FALLBACK7-NEXT: xorps %xmm2, %xmm2 +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: andb $24, %al +; FALLBACK7-NEXT: movzbl %al, %eax +; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK7-NEXT: movq %rdi, %r8 +; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq %r10, 8(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rax, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: lshr_32bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK8-NEXT: movzbl (%rsi), %ecx +; FALLBACK8-NEXT: leal (,%rcx,8), %eax +; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK8-NEXT: 
vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: andb $24, %cl +; FALLBACK8-NEXT: movzbl %cl, %r9d +; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rdi +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, 24(%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, (%rdx) +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: lshr_32bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: movzbl (%rsi), %eax +; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: andb $24, %al +; FALLBACK9-NEXT: movzbl %al, %eax +; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK9-NEXT: movq %rdi, %r8 +; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq 
-64(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK9-NEXT: shrq %cl, %rsi +; FALLBACK9-NEXT: movq %r10, 8(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: lshr_32bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: movzbl (%rsi), %ecx +; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: andb $24, %cl +; FALLBACK10-NEXT: movzbl %cl, %ecx +; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r9, %rcx +; FALLBACK10-NEXT: addq %r8, %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: lshr_32bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: movzbl (%rsi), %eax +; FALLBACK11-NEXT: leal (,%rax,8), %ecx +; 
FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: andb $24, %al +; FALLBACK11-NEXT: movzbl %al, %eax +; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK11-NEXT: movq %rdi, %r8 +; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq %r10, 8(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rax, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: lshr_32bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK12-NEXT: movzbl (%rsi), %ecx +; FALLBACK12-NEXT: leal (,%rcx,8), %eax +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: andb $24, %cl +; FALLBACK12-NEXT: movzbl %cl, %r9d +; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rdi +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq 
%r11, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, 24(%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, (%rdx) +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: lshr_32bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK13-NEXT: movzbl (%rsi), %eax +; FALLBACK13-NEXT: leal (,%rax,8), %ecx +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: andb $24, %al +; FALLBACK13-NEXT: movzbl %al, %eax +; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK13-NEXT: movq %rdi, %r8 +; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: shrq %cl, %rsi +; FALLBACK13-NEXT: movq %r10, 8(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: lshr_32bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK14-NEXT: movzbl (%rsi), %ecx +; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: andb $24, %cl 
+; FALLBACK14-NEXT: movzbl %cl, %ecx +; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: addq %rcx, %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r9, %rcx +; FALLBACK14-NEXT: addq %r8, %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: lshr_32bytes: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK15-NEXT: movzbl (%rsi), %eax +; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: andb $24, %al +; FALLBACK15-NEXT: movzbl %al, %eax +; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK15-NEXT: movq %rdi, %r8 +; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq %r10, 8(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; 
FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: lshr_32bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $108, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK16-NEXT: movl (%ebp), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%ebp), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%ebp), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%ebp), %edi +; FALLBACK16-NEXT: movl 16(%ebp), %ebx +; FALLBACK16-NEXT: movb (%eax), %ah +; FALLBACK16-NEXT: movl 20(%ebp), %esi +; FALLBACK16-NEXT: movl 24(%ebp), %ecx +; FALLBACK16-NEXT: movl 28(%ebp), %ebp +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movb %ah, %dh +; FALLBACK16-NEXT: shlb $3, %dh +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $28, %ah +; FALLBACK16-NEXT: movzbl %ah, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi +; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax +; FALLBACK16-NEXT: 
movl %eax, %ebx +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movb %dh, %dl +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %ebp +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl %eax, %ebx +; FALLBACK16-NEXT: addl %eax, %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %esi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp +; FALLBACK16-NEXT: movl %ebp, %esi +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%eax,%eax), %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %esi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi +; FALLBACK16-NEXT: movl %edi, %ebx +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %ebx, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movb %dh, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %ebx, 28(%eax) +; FALLBACK16-NEXT: movl %esi, 24(%eax) +; FALLBACK16-NEXT: movl %edi, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: movl %ebp, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, (%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: addl $108, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: lshr_32bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $92, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl (%ebp), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: 
movl 4(%ebp), %eax +; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%ebp), %esi +; FALLBACK17-NEXT: movl 12(%ebp), %edi +; FALLBACK17-NEXT: movl 16(%ebp), %ebx +; FALLBACK17-NEXT: movb (%ecx), %ch +; FALLBACK17-NEXT: movl 20(%ebp), %edx +; FALLBACK17-NEXT: movl 24(%ebp), %eax +; FALLBACK17-NEXT: movl 28(%ebp), %ebp +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: andb $28, %ch +; FALLBACK17-NEXT: movzbl %ch, %ebp +; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx +; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi 
+; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl %edx, 24(%ebp) +; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: shrl %cl, %eax +; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl %ebx, 16(%ebp) +; FALLBACK17-NEXT: movl %edi, 20(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %esi, (%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: addl $92, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: lshr_32bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $108, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%eax), %esi +; FALLBACK18-NEXT: movl 12(%eax), %edi +; FALLBACK18-NEXT: movl 16(%eax), %ebp +; FALLBACK18-NEXT: movzbl (%ebx), %ebx +; FALLBACK18-NEXT: movl 20(%eax), %edx +; FALLBACK18-NEXT: movl 24(%eax), %ecx +; FALLBACK18-NEXT: movl 28(%eax), %eax +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: 
movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: andb $28, %bl +; FALLBACK18-NEXT: movzbl %bl, %edi +; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %eax, %edx +; FALLBACK18-NEXT: movl %eax, %ebx +; FALLBACK18-NEXT: notb %dl +; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi +; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx +; FALLBACK18-NEXT: orl %ebx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded 
Reload +; FALLBACK18-NEXT: movl %eax, %ebx +; FALLBACK18-NEXT: addl %ebp, %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %esi, %ecx +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %eax, %eax +; FALLBACK18-NEXT: shlxl %edx, %eax, %esi +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl %ebx, 28(%eax) +; FALLBACK18-NEXT: movl %edi, 24(%eax) +; FALLBACK18-NEXT: movl %esi, 16(%eax) +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $108, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: lshr_32bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx 
+; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $92, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%ecx), %esi +; FALLBACK19-NEXT: movl 12(%ecx), %edi +; FALLBACK19-NEXT: movl 16(%ecx), %ebp +; FALLBACK19-NEXT: movzbl (%ebx), %ebx +; FALLBACK19-NEXT: movl 20(%ecx), %edx +; FALLBACK19-NEXT: movl 24(%ecx), %eax +; FALLBACK19-NEXT: movl 28(%ecx), %ecx +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, %ecx +; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: andb $28, %bl +; FALLBACK19-NEXT: movzbl %bl, %ebp +; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %esi, %eax +; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx +; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, 
%esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl %edx, %esi +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi +; FALLBACK19-NEXT: shrdl %cl, %edi, %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl %eax, 24(%ebp) +; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK19-NEXT: movl %eax, 28(%ebp) +; FALLBACK19-NEXT: movl %ebx, 16(%ebp) +; FALLBACK19-NEXT: movl %esi, 20(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 8(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: movl %edx, (%ebp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 4(%ebp) +; FALLBACK19-NEXT: addl $92, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: lshr_32bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $108, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK20-NEXT: movzbl (%eax), %ecx +; FALLBACK20-NEXT: movl %ecx, %eax +; FALLBACK20-NEXT: shlb $3, %al +; FALLBACK20-NEXT: xorps %xmm2, %xmm2 +; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: 
movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: andb $28, %cl +; FALLBACK20-NEXT: movzbl %cl, %ecx +; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi +; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx +; FALLBACK20-NEXT: movl %ecx, %edi +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %esi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %esi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK20-NEXT: movl %esi, %ebx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %ebp, %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %ebx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK20-NEXT: movl %edx, 
%ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %ebp, %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl %ebx, 28(%eax) +; FALLBACK20-NEXT: movl %esi, 4(%eax) +; FALLBACK20-NEXT: movl %edi, 24(%eax) +; FALLBACK20-NEXT: movl %ebp, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, (%eax) +; FALLBACK20-NEXT: addl $108, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; 
FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: lshr_32bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $108, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK21-NEXT: movzbl (%eax), %eax +; FALLBACK21-NEXT: movl %eax, %ecx +; FALLBACK21-NEXT: shlb $3, %cl +; FALLBACK21-NEXT: xorps %xmm2, %xmm2 +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: andb $28, %al +; FALLBACK21-NEXT: movzbl %al, %ebp +; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl %edi, %esi +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi +; 
FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %esi, 4(%ebp) +; FALLBACK21-NEXT: movl %ebx, 24(%ebp) +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: shrl %cl, %eax +; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 12(%ebp) +; FALLBACK21-NEXT: movl %edx, (%ebp) +; FALLBACK21-NEXT: addl $108, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: lshr_32bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $108, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK22-NEXT: movzbl (%eax), %ecx +; FALLBACK22-NEXT: movl %ecx, %edx +; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: xorps %xmm2, %xmm2 +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: andb $28, %cl +; FALLBACK22-NEXT: movzbl %cl, %edi +; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx +; FALLBACK22-NEXT: movl %edx, %eax +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %esi, 
%esi +; FALLBACK22-NEXT: shlxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx +; FALLBACK22-NEXT: orl %ebx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx +; FALLBACK22-NEXT: orl %ebx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx +; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK22-NEXT: orl %ebp, %ebx +; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %eax +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: movl %ecx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, 
%ecx +; FALLBACK22-NEXT: orl %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl %ecx, 4(%edx) +; FALLBACK22-NEXT: movl %edi, 24(%edx) +; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, (%edx) +; FALLBACK22-NEXT: addl $108, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: lshr_32bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $108, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK23-NEXT: movzbl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, %ecx +; FALLBACK23-NEXT: shlb $3, %cl +; FALLBACK23-NEXT: xorps %xmm2, %xmm2 +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: andb $28, %al +; FALLBACK23-NEXT: movzbl %al, %ebx +; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx +; 
FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, %edi +; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl %ebx, 4(%eax) +; FALLBACK23-NEXT: movl %ebp, 24(%eax) +; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK23-NEXT: movl %ebx, 28(%eax) +; FALLBACK23-NEXT: movl %esi, 16(%eax) +; FALLBACK23-NEXT: movl %edi, 20(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: movl %esi, 8(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: movl %esi, 12(%eax) +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, (%eax) +; FALLBACK23-NEXT: addl $108, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: lshr_32bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $108, %esp +; 
FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK24-NEXT: movzbl (%eax), %ecx +; FALLBACK24-NEXT: movl %ecx, %eax +; FALLBACK24-NEXT: shlb $3, %al +; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: andb $28, %cl +; FALLBACK24-NEXT: movzbl %cl, %ecx +; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi +; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx +; FALLBACK24-NEXT: movl %ecx, %edi +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %esi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %esi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK24-NEXT: movl %esi, %ebx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %ebp, %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp +; 
FALLBACK24-NEXT: movl %ebp, %ebx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %ebp, %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %ebx, 28(%eax) +; FALLBACK24-NEXT: movl %esi, 4(%eax) +; FALLBACK24-NEXT: movl %edi, 24(%eax) +; FALLBACK24-NEXT: movl %ebp, 16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 
12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, (%eax) +; FALLBACK24-NEXT: addl $108, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: vzeroupper +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: lshr_32bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $108, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK25-NEXT: movzbl (%eax), %eax +; FALLBACK25-NEXT: movl %eax, %ecx +; FALLBACK25-NEXT: shlb $3, %cl +; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: andb $28, %al +; FALLBACK25-NEXT: movzbl %al, %ebp +; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 
36(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl %edi, %esi +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %esi, 4(%ebp) +; FALLBACK25-NEXT: movl %ebx, 24(%ebp) +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: shrl %cl, %eax +; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %edx, (%ebp) +; FALLBACK25-NEXT: addl $108, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: vzeroupper +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: lshr_32bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $108, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK26-NEXT: movzbl (%eax), %ecx +; FALLBACK26-NEXT: movl %ecx, %edx +; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: andb $28, %cl +; FALLBACK26-NEXT: movzbl %cl, %edi +; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx +; FALLBACK26-NEXT: movl %edx, %eax +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK26-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx +; FALLBACK26-NEXT: orl %ebx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx +; FALLBACK26-NEXT: orl %ebx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx +; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax +; FALLBACK26-NEXT: movl %ebp, %ecx +; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK26-NEXT: orl %ebp, %ebx +; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %eax +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: movl %ecx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl %ecx, 4(%edx) +; FALLBACK26-NEXT: movl %edi, 24(%edx) +; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, (%edx) +; FALLBACK26-NEXT: addl $108, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: vzeroupper +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: lshr_32bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $108, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK27-NEXT: movzbl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, %ecx +; FALLBACK27-NEXT: shlb $3, %cl +; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: andb $28, %al +; FALLBACK27-NEXT: movzbl %al, %ebx +; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, %edi +; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl %ebx, 4(%eax) +; FALLBACK27-NEXT: movl %ebp, 24(%eax) +; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK27-NEXT: movl %ebx, 28(%eax) +; FALLBACK27-NEXT: movl %esi, 16(%eax) +; FALLBACK27-NEXT: movl %edi, 20(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: movl %esi, 8(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: movl %esi, 12(%eax) +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, (%eax) +; FALLBACK27-NEXT: addl $108, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: vzeroupper +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: lshr_32bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $108, %esp +; 
FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK28-NEXT: movzbl (%eax), %ecx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: shlb $3, %al +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: andb $28, %cl +; FALLBACK28-NEXT: movzbl %cl, %ecx +; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi +; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx +; FALLBACK28-NEXT: movl %ecx, %edi +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %esi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %esi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK28-NEXT: movl %esi, %ebx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %ebp, %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp +; 
FALLBACK28-NEXT: movl %ebp, %ebx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %ebp, %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %ebx, 28(%eax) +; FALLBACK28-NEXT: movl %esi, 4(%eax) +; FALLBACK28-NEXT: movl %edi, 24(%eax) +; FALLBACK28-NEXT: movl %ebp, 16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 
12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, (%eax) +; FALLBACK28-NEXT: addl $108, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: vzeroupper +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: lshr_32bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $108, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK29-NEXT: movzbl (%eax), %eax +; FALLBACK29-NEXT: movl %eax, %ecx +; FALLBACK29-NEXT: shlb $3, %cl +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: andb $28, %al +; FALLBACK29-NEXT: movzbl %al, %ebp +; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 
36(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl %edi, %esi +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %esi, 4(%ebp) +; FALLBACK29-NEXT: movl %ebx, 24(%ebp) +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: shrl %cl, %eax +; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %edx, (%ebp) +; FALLBACK29-NEXT: addl $108, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: vzeroupper +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: lshr_32bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $108, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK30-NEXT: movzbl (%eax), %ecx +; FALLBACK30-NEXT: movl %ecx, %edx +; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: andb $28, %cl +; FALLBACK30-NEXT: movzbl %cl, %edi +; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx +; FALLBACK30-NEXT: movl %edx, %eax +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK30-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx +; FALLBACK30-NEXT: orl %ebx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx +; FALLBACK30-NEXT: orl %ebx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx +; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax +; FALLBACK30-NEXT: movl %ebp, %ecx +; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK30-NEXT: orl %ebp, %ebx +; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK30-NEXT: shrxl %edx, %edi, %eax +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: movl %ecx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK30-NEXT: orl %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl %ecx, 4(%edx) +; FALLBACK30-NEXT: movl %edi, 24(%edx) +; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, (%edx) +; FALLBACK30-NEXT: addl $108, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: vzeroupper +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: lshr_32bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $108, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK31-NEXT: movzbl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, %ecx +; FALLBACK31-NEXT: shlb $3, %cl +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: andb $28, %al +; FALLBACK31-NEXT: movzbl %al, %ebx +; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, %edi +; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl %ebx, 4(%eax) +; FALLBACK31-NEXT: movl %ebp, 24(%eax) +; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK31-NEXT: movl %ebx, 28(%eax) +; FALLBACK31-NEXT: movl %esi, 16(%eax) +; FALLBACK31-NEXT: movl %edi, 20(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: movl %esi, 8(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: movl %esi, 12(%eax) +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, (%eax) +; FALLBACK31-NEXT: addl $108, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: vzeroupper +; FALLBACK31-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void 
@lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; FALLBACK0-LABEL: lshr_32bytes_dwordOff: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: movl %esi, %eax +; FALLBACK0-NEXT: shlb $5, %al +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: andb $6, %sil +; FALLBACK0-NEXT: movzbl %sil, %r9d +; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r11, %r8 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, 24(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %rdi, (%rdx) +; FALLBACK0-NEXT: movq %r8, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; 
FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: lshr_32bytes_dwordOff: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: movl %esi, %ecx +; FALLBACK1-NEXT: shlb $5, %cl +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: andb $6, %sil +; FALLBACK1-NEXT: movzbl %sil, %eax +; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi +; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi +; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK1-NEXT: movq %r8, %r9 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: shrq %cl, %rax +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rax, 24(%rdx) +; FALLBACK1-NEXT: movq %rdi, (%rdx) +; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: lshr_32bytes_dwordOff: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi +; FALLBACK2-NEXT: movl %esi, %eax +; FALLBACK2-NEXT: shlb $5, %al +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: andb $6, %sil +; 
FALLBACK2-NEXT: movzbl %sil, %ecx +; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: addq %rcx, %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: lshr_32bytes_dwordOff: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: movl %esi, %ecx +; FALLBACK3-NEXT: shlb $5, %cl +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: andb $6, %sil +; FALLBACK3-NEXT: movzbl %sil, %eax +; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi +; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi +; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK3-NEXT: movq %r8, %r9 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi +; 
FALLBACK3-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rax, 24(%rdx) +; FALLBACK3-NEXT: movq %rdi, (%rdx) +; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: lshr_32bytes_dwordOff: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movzbl (%rsi), %ecx +; FALLBACK4-NEXT: movl %ecx, %eax +; FALLBACK4-NEXT: shlb $5, %al +; FALLBACK4-NEXT: xorps %xmm2, %xmm2 +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: andb $6, %cl +; FALLBACK4-NEXT: movzbl %cl, %r9d +; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rdi +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 +; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, 24(%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, (%rdx) +; 
FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: lshr_32bytes_dwordOff: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movzbl (%rsi), %eax +; FALLBACK5-NEXT: movl %eax, %ecx +; FALLBACK5-NEXT: shlb $5, %cl +; FALLBACK5-NEXT: xorps %xmm2, %xmm2 +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: andb $6, %al +; FALLBACK5-NEXT: movzbl %al, %eax +; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK5-NEXT: movq %rdi, %r8 +; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: shrq %cl, %rsi +; FALLBACK5-NEXT: movq %r10, 8(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: lshr_32bytes_dwordOff: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movzbl (%rsi), %ecx +; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: shlb $5, %al +; FALLBACK6-NEXT: xorps %xmm2, %xmm2 +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: andb $6, %cl +; FALLBACK6-NEXT: movzbl %cl, %ecx +; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), 
%rcx +; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r9, %rcx +; FALLBACK6-NEXT: addq %r8, %r8 +; FALLBACK6-NEXT: shlxq %rax, %r8, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: lshr_32bytes_dwordOff: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movzbl (%rsi), %eax +; FALLBACK7-NEXT: movl %eax, %ecx +; FALLBACK7-NEXT: shlb $5, %cl +; FALLBACK7-NEXT: xorps %xmm2, %xmm2 +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: andb $6, %al +; FALLBACK7-NEXT: movzbl %al, %eax +; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK7-NEXT: movq %rdi, %r8 +; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq %r10, 8(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rax, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: lshr_32bytes_dwordOff: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; 
FALLBACK8-NEXT: movzbl (%rsi), %ecx +; FALLBACK8-NEXT: movl %ecx, %eax +; FALLBACK8-NEXT: shlb $5, %al +; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: andb $6, %cl +; FALLBACK8-NEXT: movzbl %cl, %r9d +; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rdi +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 +; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, 24(%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, (%rdx) +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: lshr_32bytes_dwordOff: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: movzbl (%rsi), %eax +; FALLBACK9-NEXT: movl %eax, %ecx +; FALLBACK9-NEXT: shlb $5, %cl +; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: andb $6, %al +; FALLBACK9-NEXT: movzbl %al, %eax +; FALLBACK9-NEXT: movq 
-48(%rsp,%rax,4), %rsi +; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK9-NEXT: movq %rdi, %r8 +; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: shrq %cl, %rsi +; FALLBACK9-NEXT: movq %r10, 8(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: lshr_32bytes_dwordOff: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: movzbl (%rsi), %ecx +; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: andb $6, %cl +; FALLBACK10-NEXT: movzbl %cl, %ecx +; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r9, %rcx +; FALLBACK10-NEXT: addq %r8, %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: vzeroupper +; 
FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: lshr_32bytes_dwordOff: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: movzbl (%rsi), %eax +; FALLBACK11-NEXT: movl %eax, %ecx +; FALLBACK11-NEXT: shlb $5, %cl +; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: andb $6, %al +; FALLBACK11-NEXT: movzbl %al, %eax +; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK11-NEXT: movq %rdi, %r8 +; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq %r10, 8(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rax, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: lshr_32bytes_dwordOff: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK12-NEXT: movzbl (%rsi), %ecx +; FALLBACK12-NEXT: movl %ecx, %eax +; FALLBACK12-NEXT: shlb $5, %al +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: andb $6, %cl +; FALLBACK12-NEXT: movzbl %cl, %r9d +; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rdi +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: movq 
-48(%rsp,%r9,4), %r10 +; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, 24(%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, (%rdx) +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: lshr_32bytes_dwordOff: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK13-NEXT: movzbl (%rsi), %eax +; FALLBACK13-NEXT: movl %eax, %ecx +; FALLBACK13-NEXT: shlb $5, %cl +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: andb $6, %al +; FALLBACK13-NEXT: movzbl %al, %eax +; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK13-NEXT: movq %rdi, %r8 +; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: shrq %cl, %rsi +; FALLBACK13-NEXT: movq %r10, 8(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: lshr_32bytes_dwordOff: +; FALLBACK14: # %bb.0: +; 
FALLBACK14-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK14-NEXT: movzbl (%rsi), %ecx +; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: andb $6, %cl +; FALLBACK14-NEXT: movzbl %cl, %ecx +; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: addq %rcx, %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r9, %rcx +; FALLBACK14-NEXT: addq %r8, %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: lshr_32bytes_dwordOff: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK15-NEXT: movzbl (%rsi), %eax +; FALLBACK15-NEXT: movl %eax, %ecx +; FALLBACK15-NEXT: shlb $5, %cl +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: andb $6, %al +; FALLBACK15-NEXT: movzbl %al, %eax +; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK15-NEXT: movq %rdi, %r8 +; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK15-NEXT: movq 
-72(%rsp,%rax,4), %r9 +; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq %r10, 8(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; X86-SSE2-LABEL: lshr_32bytes_dwordOff: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: pushl %edi +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl (%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 4(%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%eax), %esi +; X86-SSE2-NEXT: movl 12(%eax), %edi +; X86-SSE2-NEXT: movl 16(%eax), %ebx +; X86-SSE2-NEXT: movl 20(%eax), %ebp +; X86-SSE2-NEXT: movl 24(%eax), %edx +; X86-SSE2-NEXT: movl 28(%eax), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: andl $7, %eax +; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi +; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi +; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx +; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp +; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx +; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %edx, 28(%eax) +; X86-SSE2-NEXT: movl %ebp, 16(%eax) +; X86-SSE2-NEXT: movl %ebx, 20(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: addl $92, %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: lshr_32bytes_dwordOff: +; X86-SSE42: # %bb.0: +; X86-SSE42-NEXT: subl $76, %esp +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE42-NEXT: movups (%edx), %xmm0 +; X86-SSE42-NEXT: movups 16(%edx), %xmm1 +; X86-SSE42-NEXT: movzbl (%ecx), %ecx +; X86-SSE42-NEXT: xorps %xmm2, %xmm2 +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) +; X86-SSE42-NEXT: andl $7, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1 +; X86-SSE42-NEXT: movups %xmm1, 16(%eax) +; X86-SSE42-NEXT: movups %xmm0, (%eax) +; X86-SSE42-NEXT: addl $76, %esp +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: lshr_32bytes_dwordOff: +; X86-AVX: # 
%bb.0: +; X86-AVX-NEXT: subl $76, %esp +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: vmovups (%edx), %ymm0 +; X86-AVX-NEXT: movzbl (%ecx), %ecx +; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovups %ymm0, (%esp) +; X86-AVX-NEXT: andl $7, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 +; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1 +; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: addl $76, %esp +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 5 + %res = lshr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: lshr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi ; X64-SSE2-NEXT: movzbl (%rsi), %esi +; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $31, %esi -; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax -; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: andl $3, %esi +; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), 
%rax +; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx +; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi +; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: lshr_32bytes: +; X64-SSE42-LABEL: lshr_32bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movzbl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm2, %xmm2 -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andl $31, %eax -; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0 -; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1 +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: andl $3, %eax +; X64-SSE42-NEXT: movups -72(%rsp,%rax,8), %xmm0 +; X64-SSE42-NEXT: movups -56(%rsp,%rax,8), %xmm1 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: lshr_32bytes: +; X64-AVX-LABEL: lshr_32bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: movzbl (%rsi), %eax ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $31, %eax -; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1 +; X64-AVX-NEXT: andl $3, %eax +; X64-AVX-NEXT: vmovups -72(%rsp,%rax,8), %xmm0 +; X64-AVX-NEXT: vmovups -56(%rsp,%rax,8), %xmm1 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: 
vzeroupper ; X64-AVX-NEXT: retq ; -; X86-SSE2-LABEL: lshr_32bytes: +; X86-SSE2-LABEL: lshr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $72, %esp +; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %esi ; X86-SSE2-NEXT: movl 12(%eax), %edi ; X86-SSE2-NEXT: movl 16(%eax), %ebx @@ -1148,35 +5833,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: 
andl $31, %eax -; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx +; X86-SSE2-NEXT: andl $3, %eax +; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi +; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi +; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx +; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp +; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx +; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) @@ -1186,18 +5866,18 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $72, %esp +; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: lshr_32bytes: +; X86-SSE42-LABEL: lshr_32bytes_qwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $64, %esp +; X86-SSE42-NEXT: subl $76, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1205,21 +5885,21 @@ define void 
@lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm2, %xmm2 -; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, (%esp) -; X86-SSE42-NEXT: andl $31, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 -; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1 +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) +; X86-SSE42-NEXT: andl $3, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $64, %esp +; X86-SSE42-NEXT: addl $76, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: lshr_32bytes: +; X86-AVX-LABEL: lshr_32bytes_qwordOff: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $64, %esp +; X86-AVX-NEXT: subl $76, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1228,137 +5908,2830 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) -; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 +; X86-AVX-NEXT: andl $3, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 +; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $64, %esp +; X86-AVX-NEXT: addl $76, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 - 
%byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %qwordOff = load i256, ptr %qwordOff.ptr, align 1 + %bitOff = shl i256 %qwordOff, 6 %res = lshr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: shl_32bytes: +; FALLBACK0-LABEL: shl_32bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: leal (,%rsi,8), %eax +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: andb $24, %sil +; FALLBACK0-NEXT: negb %sil +; FALLBACK0-NEXT: movsbq %sil, %r10 +; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 +; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq %r8, %r9 +; FALLBACK0-NEXT: shrq %r9 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r9 +; FALLBACK0-NEXT: orq %r11, %r9 +; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 +; FALLBACK0-NEXT: movq %r10, %rbx +; FALLBACK0-NEXT: shrq %rbx +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: orq %r11, %rbx +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: shrq %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: 
shrq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: movq %r8, (%rdx) +; FALLBACK0-NEXT: movq %rdi, 16(%rdx) +; FALLBACK0-NEXT: movq %rbx, 24(%rdx) +; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: shl_32bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: leal (,%rsi,8), %ecx +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: andb $24, %sil +; FALLBACK1-NEXT: negb %sil +; FALLBACK1-NEXT: movsbq %sil, %rax +; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK1-NEXT: shldq %cl, %rax, %rsi +; FALLBACK1-NEXT: shldq %cl, %r8, %rax +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: shlq %cl, %r8 +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rdi, 24(%rdx) +; FALLBACK1-NEXT: movq %r8, (%rdx) +; FALLBACK1-NEXT: movq %rax, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: shl_32bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi +; FALLBACK2-NEXT: leal (,%rsi,8), %eax +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: andb $24, %sil +; FALLBACK2-NEXT: negb %sil +; FALLBACK2-NEXT: movsbq %sil, %rsi +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 +; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 +; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shrq %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: shrq %rsi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: shrq %rcx +; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, 24(%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: shl_32bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: leal (,%rsi,8), %ecx +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: andb $24, %sil +; FALLBACK3-NEXT: negb %sil +; FALLBACK3-NEXT: movsbq %sil, %rax +; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi 
+; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK3-NEXT: shldq %cl, %rax, %rsi +; FALLBACK3-NEXT: shldq %cl, %r8, %rax +; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rdi, 24(%rdx) +; FALLBACK3-NEXT: movq %rcx, (%rdx) +; FALLBACK3-NEXT: movq %rax, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: shl_32bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movzbl (%rsi), %ecx +; FALLBACK4-NEXT: leal (,%rcx,8), %eax +; FALLBACK4-NEXT: xorps %xmm2, %xmm2 +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: andb $24, %cl +; FALLBACK4-NEXT: negb %cl +; FALLBACK4-NEXT: movsbq %cl, %r8 +; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK4-NEXT: movq %r10, %rdi +; FALLBACK4-NEXT: shrq %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %rdi +; FALLBACK4-NEXT: orq %r9, %rdi +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK4-NEXT: movq %r8, %r11 +; FALLBACK4-NEXT: shrq %r11 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: orq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: movq %r9, %r10 +; FALLBACK4-NEXT: shrq %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl 
%eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, (%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %r11, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, 24(%rdx) +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: shl_32bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movzbl (%rsi), %eax +; FALLBACK5-NEXT: leal (,%rax,8), %ecx +; FALLBACK5-NEXT: xorps %xmm2, %xmm2 +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: andb $24, %al +; FALLBACK5-NEXT: negb %al +; FALLBACK5-NEXT: movsbq %al, %rax +; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK5-NEXT: shldq %cl, %rax, %rsi +; FALLBACK5-NEXT: movq %r8, %r9 +; FALLBACK5-NEXT: shlq %cl, %r9 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: shldq %cl, %r8, %rax +; FALLBACK5-NEXT: movq %rax, 8(%rdx) +; FALLBACK5-NEXT: movq %rsi, 16(%rdx) +; FALLBACK5-NEXT: movq %rdi, 24(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: shl_32bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movzbl (%rsi), %ecx +; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: xorps %xmm2, %xmm2 +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: andb $24, %cl +; FALLBACK6-NEXT: negb %cl +; FALLBACK6-NEXT: movsbq %cl, %rcx +; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; 
FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: shrq %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: shrq %rcx +; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r8, %rcx +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: shl_32bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movzbl (%rsi), %eax +; FALLBACK7-NEXT: leal (,%rax,8), %ecx +; FALLBACK7-NEXT: xorps %xmm2, %xmm2 +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: andb $24, %al +; FALLBACK7-NEXT: negb %al +; FALLBACK7-NEXT: movsbq %al, %rax +; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK7-NEXT: shldq %cl, %rax, %rsi +; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK7-NEXT: shldq %cl, %r8, %rax +; FALLBACK7-NEXT: movq %rax, 8(%rdx) +; FALLBACK7-NEXT: movq %rsi, 16(%rdx) +; FALLBACK7-NEXT: movq %rdi, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: 
shl_32bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK8-NEXT: movzbl (%rsi), %ecx +; FALLBACK8-NEXT: leal (,%rcx,8), %eax +; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: andb $24, %cl +; FALLBACK8-NEXT: negb %cl +; FALLBACK8-NEXT: movsbq %cl, %r8 +; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK8-NEXT: movq %r10, %rdi +; FALLBACK8-NEXT: shrq %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %rdi +; FALLBACK8-NEXT: orq %r9, %rdi +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK8-NEXT: movq %r8, %r11 +; FALLBACK8-NEXT: shrq %r11 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: orq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: movq %r9, %r10 +; FALLBACK8-NEXT: shrq %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, (%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %r11, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, 24(%rdx) +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: shl_32bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: movzbl (%rsi), %eax +; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: andb $24, %al +; FALLBACK9-NEXT: 
negb %al +; FALLBACK9-NEXT: movsbq %al, %rax +; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK9-NEXT: shldq %cl, %rax, %rsi +; FALLBACK9-NEXT: movq %r8, %r9 +; FALLBACK9-NEXT: shlq %cl, %r9 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK9-NEXT: shldq %cl, %r8, %rax +; FALLBACK9-NEXT: movq %rax, 8(%rdx) +; FALLBACK9-NEXT: movq %rsi, 16(%rdx) +; FALLBACK9-NEXT: movq %rdi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: shl_32bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: movzbl (%rsi), %ecx +; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: andb $24, %cl +; FALLBACK10-NEXT: negb %cl +; FALLBACK10-NEXT: movsbq %cl, %rcx +; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: shrq %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: shrq %rcx +; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r8, %rcx +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, 
24(%rdx) +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: shl_32bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: movzbl (%rsi), %eax +; FALLBACK11-NEXT: leal (,%rax,8), %ecx +; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: andb $24, %al +; FALLBACK11-NEXT: negb %al +; FALLBACK11-NEXT: movsbq %al, %rax +; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK11-NEXT: shldq %cl, %rax, %rsi +; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK11-NEXT: shldq %cl, %r8, %rax +; FALLBACK11-NEXT: movq %rax, 8(%rdx) +; FALLBACK11-NEXT: movq %rsi, 16(%rdx) +; FALLBACK11-NEXT: movq %rdi, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: shl_32bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK12-NEXT: movzbl (%rsi), %ecx +; FALLBACK12-NEXT: leal (,%rcx,8), %eax +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: andb $24, %cl +; FALLBACK12-NEXT: negb %cl +; FALLBACK12-NEXT: movsbq %cl, %r8 +; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK12-NEXT: movq %r10, %rdi +; FALLBACK12-NEXT: shrq %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %rdi +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movl %eax, %ecx +; 
FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK12-NEXT: movq %r8, %r11 +; FALLBACK12-NEXT: shrq %r11 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: orq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: movq %r9, %r10 +; FALLBACK12-NEXT: shrq %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, (%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %r11, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, 24(%rdx) +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: shl_32bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK13-NEXT: movzbl (%rsi), %eax +; FALLBACK13-NEXT: leal (,%rax,8), %ecx +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: andb $24, %al +; FALLBACK13-NEXT: negb %al +; FALLBACK13-NEXT: movsbq %al, %rax +; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK13-NEXT: shldq %cl, %rax, %rsi +; FALLBACK13-NEXT: movq %r8, %r9 +; FALLBACK13-NEXT: shlq %cl, %r9 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: shldq %cl, %r8, %rax +; FALLBACK13-NEXT: movq %rax, 8(%rdx) +; FALLBACK13-NEXT: movq %rsi, 16(%rdx) +; FALLBACK13-NEXT: movq %rdi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: shl_32bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 
+; FALLBACK14-NEXT: movzbl (%rsi), %ecx +; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: andb $24, %cl +; FALLBACK14-NEXT: negb %cl +; FALLBACK14-NEXT: movsbq %cl, %rcx +; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: shrq %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: shrq %rcx +; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r8, %rcx +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: shl_32bytes: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK15-NEXT: movzbl (%rsi), %eax +; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: andb $24, %al +; FALLBACK15-NEXT: negb %al +; FALLBACK15-NEXT: movsbq %al, %rax +; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK15-NEXT: shldq %cl, %rax, %rsi +; 
FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK15-NEXT: shldq %cl, %r8, %rax +; FALLBACK15-NEXT: movq %rax, 8(%rdx) +; FALLBACK15-NEXT: movq %rsi, 16(%rdx) +; FALLBACK15-NEXT: movq %rdi, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: shl_32bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $108, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl (%ecx), %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%ecx), %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%ecx), %esi +; FALLBACK16-NEXT: movl 12(%ecx), %edi +; FALLBACK16-NEXT: movl 16(%ecx), %ebx +; FALLBACK16-NEXT: movb (%eax), %ah +; FALLBACK16-NEXT: movl 20(%ecx), %ebp +; FALLBACK16-NEXT: movl 24(%ecx), %edx +; FALLBACK16-NEXT: movl 28(%ecx), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movb %ah, %ch +; FALLBACK16-NEXT: shlb $3, %ch +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $28, %ah +; FALLBACK16-NEXT: negb %ah +; FALLBACK16-NEXT: movsbl %ah, %ebx 
+; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax +; FALLBACK16-NEXT: movl %eax, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movb %ch, %dl +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %edi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, %ebx +; FALLBACK16-NEXT: shrl %ebx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: orl %eax, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: orl %edi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %edx, (%eax) +; FALLBACK16-NEXT: movl %esi, 24(%eax) +; FALLBACK16-NEXT: movl %ebx, 28(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 16(%eax) +; FALLBACK16-NEXT: movl %ebp, 20(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: addl $108, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: shl_32bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $92, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl (%eax), %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%eax), %edx +; FALLBACK17-NEXT: movl %edx, 
(%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%eax), %esi +; FALLBACK17-NEXT: movl 12(%eax), %edi +; FALLBACK17-NEXT: movl 16(%eax), %ebx +; FALLBACK17-NEXT: movb (%ecx), %ch +; FALLBACK17-NEXT: movl 20(%eax), %ebp +; FALLBACK17-NEXT: movl 24(%eax), %edx +; FALLBACK17-NEXT: movl 28(%eax), %eax +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movb %ch, %cl +; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: andb $28, %ch +; FALLBACK17-NEXT: negb %ch +; FALLBACK17-NEXT: movsbl %ch, %eax +; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx +; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl %ebx, %esi +; FALLBACK17-NEXT: shldl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi +; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp +; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %edi, %ebp +; FALLBACK17-NEXT: shldl %cl, %ebx, %edi +; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx +; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi +; FALLBACK17-NEXT: shldl %cl, %edx, %esi +; 
FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: shldl %cl, %eax, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl %edx, 24(%eax) +; FALLBACK17-NEXT: movl %esi, 28(%eax) +; FALLBACK17-NEXT: movl %edi, 16(%eax) +; FALLBACK17-NEXT: movl %ebp, 20(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 8(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, 12(%eax) +; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK17-NEXT: shldl %cl, %ebx, %edx +; FALLBACK17-NEXT: shll %cl, %ebx +; FALLBACK17-NEXT: movl %ebx, (%eax) +; FALLBACK17-NEXT: movl %edx, 4(%eax) +; FALLBACK17-NEXT: addl $92, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: shl_32bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $108, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%eax), %esi +; FALLBACK18-NEXT: movl 12(%eax), %edi +; FALLBACK18-NEXT: movl 16(%eax), %ebp +; FALLBACK18-NEXT: movzbl (%ebx), %ebx +; FALLBACK18-NEXT: movl 20(%eax), %edx +; FALLBACK18-NEXT: movl 24(%eax), %ecx +; FALLBACK18-NEXT: movl 28(%eax), %eax +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, %edx +; FALLBACK18-NEXT: shlb $3, %dl +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; 
FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: andb $28, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %eax, %edi +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl %ebx, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK18-NEXT: 
shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx +; FALLBACK18-NEXT: orl %eax, %edx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl %edx, 24(%eax) +; FALLBACK18-NEXT: movl %esi, 28(%eax) +; FALLBACK18-NEXT: movl %edi, 16(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $108, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: shl_32bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $92, %esp +; FALLBACK19-NEXT: movl 
{{[0-9]+}}(%esp), %ebx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%ecx), %esi +; FALLBACK19-NEXT: movl 12(%ecx), %edi +; FALLBACK19-NEXT: movl 16(%ecx), %ebp +; FALLBACK19-NEXT: movzbl (%ebx), %ebx +; FALLBACK19-NEXT: movl 20(%ecx), %edx +; FALLBACK19-NEXT: movl 24(%ecx), %eax +; FALLBACK19-NEXT: movl 28(%ecx), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, %ecx +; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: andb $28, %bl +; FALLBACK19-NEXT: negb %bl +; FALLBACK19-NEXT: movsbl %bl, %eax +; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx +; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi +; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %edx, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx +; FALLBACK19-NEXT: shldl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi +; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp +; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK19-NEXT: shldl %cl, %edi, %ebp +; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK19-NEXT: shldl %cl, %edx, %edi +; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx +; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx +; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi +; FALLBACK19-NEXT: shldl %cl, %edx, %esi +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: shldl %cl, %eax, %edx +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl %edx, 24(%eax) +; FALLBACK19-NEXT: movl %esi, 28(%eax) +; FALLBACK19-NEXT: movl %edi, 16(%eax) +; FALLBACK19-NEXT: movl %ebp, 20(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 8(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, 12(%eax) +; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload +; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx +; FALLBACK19-NEXT: movl %edx, (%eax) +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: shldl %cl, %esi, %ebx +; FALLBACK19-NEXT: movl %ebx, 4(%eax) +; FALLBACK19-NEXT: addl $92, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: shl_32bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $108, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK20-NEXT: movzbl (%eax), %ecx +; FALLBACK20-NEXT: movb %cl, %dh +; FALLBACK20-NEXT: shlb $3, %dh +; FALLBACK20-NEXT: xorps %xmm2, %xmm2 +; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm2, 
{{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: andb $28, %cl +; FALLBACK20-NEXT: negb %cl +; FALLBACK20-NEXT: movsbl %cl, %eax +; FALLBACK20-NEXT: movl 84(%esp,%eax), %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movb %dh, %dl +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: movl 80(%esp,%eax), %esi +; FALLBACK20-NEXT: movl %eax, %ebx +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %edi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, %eax +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %esi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx +; FALLBACK20-NEXT: movl %ebx, %eax +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl 
64(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: shrl %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %edi +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %edi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl %edx, (%eax) +; FALLBACK20-NEXT: movl %ebp, 28(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 24(%eax) +; FALLBACK20-NEXT: movl %ebx, 4(%eax) +; FALLBACK20-NEXT: movl %esi, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: addl $108, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: shl_32bytes: +; 
FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $92, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK21-NEXT: movzbl (%eax), %eax +; FALLBACK21-NEXT: movl %eax, %ecx +; FALLBACK21-NEXT: shlb $3, %cl +; FALLBACK21-NEXT: xorps %xmm2, %xmm2 +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: andb $28, %al +; FALLBACK21-NEXT: negb %al +; FALLBACK21-NEXT: movsbl %al, %ebp +; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK21-NEXT: shldl %cl, %edx, %eax +; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK21-NEXT: shldl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx +; FALLBACK21-NEXT: shldl %cl, %ebx, %edi +; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl %edx, %eax +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK21-NEXT: shldl %cl, %esi, %eax +; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp +; FALLBACK21-NEXT: shldl %cl, %edx, %ebp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK21-NEXT: movl %ebp, 28(%edx) +; FALLBACK21-NEXT: movl %eax, 24(%edx) +; FALLBACK21-NEXT: movl %esi, %eax +; FALLBACK21-NEXT: shll %cl, %eax +; 
FALLBACK21-NEXT: shldl %cl, %esi, %ebx +; FALLBACK21-NEXT: movl %ebx, 4(%edx) +; FALLBACK21-NEXT: movl %edi, 8(%edx) +; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 12(%edx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 16(%edx) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 20(%edx) +; FALLBACK21-NEXT: movl %eax, (%edx) +; FALLBACK21-NEXT: addl $92, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: shl_32bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $108, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK22-NEXT: movzbl (%eax), %ecx +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: xorps %xmm2, %xmm2 +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: andb $28, %cl +; FALLBACK22-NEXT: negb %cl +; FALLBACK22-NEXT: movsbl %cl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: movl %eax, %ebx +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx +; FALLBACK22-NEXT: movl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi +; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx +; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp +; FALLBACK22-NEXT: orl %ecx, %ebp +; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK22-NEXT: movl %esi, %ecx +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: orl %edi, %ecx +; FALLBACK22-NEXT: shlxl %eax, %esi, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi +; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx +; FALLBACK22-NEXT: shlxl %eax, %edx, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK22-NEXT: movl %edi, (%esi) +; FALLBACK22-NEXT: movl %edx, 28(%esi) +; FALLBACK22-NEXT: movl %eax, 24(%esi) +; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%esi) +; 
FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: addl $108, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: shl_32bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $92, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK23-NEXT: movzbl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, %ecx +; FALLBACK23-NEXT: shlb $3, %cl +; FALLBACK23-NEXT: xorps %xmm2, %xmm2 +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: andb $28, %al +; FALLBACK23-NEXT: negb %al +; FALLBACK23-NEXT: movsbl %al, %ebx +; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %eax, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK23-NEXT: shldl %cl, %edx, %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi +; FALLBACK23-NEXT: shldl %cl, %edi, %edx +; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK23-NEXT: shldl %cl, %ebp, %edi +; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl %edx, %eax +; 
FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: shldl %cl, %esi, %eax +; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx +; FALLBACK23-NEXT: shldl %cl, %edx, %ebx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK23-NEXT: movl %ebx, 28(%edx) +; FALLBACK23-NEXT: movl %eax, 24(%edx) +; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: shldl %cl, %esi, %ebp +; FALLBACK23-NEXT: movl %ebp, 4(%edx) +; FALLBACK23-NEXT: movl %edi, 8(%edx) +; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%edx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 16(%edx) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 20(%edx) +; FALLBACK23-NEXT: movl %eax, (%edx) +; FALLBACK23-NEXT: addl $92, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: shl_32bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $108, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK24-NEXT: movzbl (%eax), %ecx +; FALLBACK24-NEXT: movb %cl, %dh +; FALLBACK24-NEXT: shlb $3, %dh +; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: andb $28, %cl +; FALLBACK24-NEXT: negb %cl +; FALLBACK24-NEXT: movsbl %cl, %eax +; FALLBACK24-NEXT: movl 84(%esp,%eax), %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dh, %cl +; 
FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movb %dh, %dl +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: movl 80(%esp,%eax), %esi +; FALLBACK24-NEXT: movl %eax, %ebx +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %edi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %esi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx +; FALLBACK24-NEXT: movl %ebx, %eax +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: shrl %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %edi +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %edi +; 
FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %edi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %edx, (%eax) +; FALLBACK24-NEXT: movl %ebp, 28(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 24(%eax) +; FALLBACK24-NEXT: movl %ebx, 4(%eax) +; FALLBACK24-NEXT: movl %esi, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: addl $108, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: vzeroupper +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: shl_32bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $92, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK25-NEXT: movzbl (%eax), 
%eax +; FALLBACK25-NEXT: movl %eax, %ecx +; FALLBACK25-NEXT: shlb $3, %cl +; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: andb $28, %al +; FALLBACK25-NEXT: negb %al +; FALLBACK25-NEXT: movsbl %al, %ebp +; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK25-NEXT: shldl %cl, %edx, %eax +; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK25-NEXT: shldl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx +; FALLBACK25-NEXT: shldl %cl, %ebx, %edi +; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl %edx, %eax +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK25-NEXT: shldl %cl, %esi, %eax +; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp +; FALLBACK25-NEXT: shldl %cl, %edx, %ebp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK25-NEXT: movl %ebp, 28(%edx) +; FALLBACK25-NEXT: movl %eax, 24(%edx) +; FALLBACK25-NEXT: movl %esi, %eax +; FALLBACK25-NEXT: shll %cl, %eax +; FALLBACK25-NEXT: shldl %cl, %esi, %ebx +; FALLBACK25-NEXT: movl %ebx, 4(%edx) +; FALLBACK25-NEXT: movl %edi, 8(%edx) +; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 12(%edx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 16(%edx) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 20(%edx) +; FALLBACK25-NEXT: movl %eax, (%edx) +; 
FALLBACK25-NEXT: addl $92, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: vzeroupper +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: shl_32bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $108, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK26-NEXT: movzbl (%eax), %ecx +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: shlb $3, %al +; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: andb $28, %cl +; FALLBACK26-NEXT: negb %cl +; FALLBACK26-NEXT: movsbl %cl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: movl %eax, %ebx +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx +; FALLBACK26-NEXT: movl %ecx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi +; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK26-NEXT: shlxl %eax, %esi, %ecx +; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp +; FALLBACK26-NEXT: orl %ecx, %ebp +; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK26-NEXT: movl %esi, %ecx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK26-NEXT: orl %edi, %ecx +; FALLBACK26-NEXT: shlxl %eax, %esi, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK26-NEXT: movl %edi, (%esi) +; FALLBACK26-NEXT: movl %edx, 28(%esi) +; FALLBACK26-NEXT: movl %eax, 24(%esi) +; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: addl $108, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: vzeroupper +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: shl_32bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; 
FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $92, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK27-NEXT: movzbl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, %ecx +; FALLBACK27-NEXT: shlb $3, %cl +; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: andb $28, %al +; FALLBACK27-NEXT: negb %al +; FALLBACK27-NEXT: movsbl %al, %ebx +; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %eax, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK27-NEXT: shldl %cl, %edx, %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi +; FALLBACK27-NEXT: shldl %cl, %edi, %edx +; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK27-NEXT: shldl %cl, %ebp, %edi +; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl %edx, %eax +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: shldl %cl, %esi, %eax +; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx +; FALLBACK27-NEXT: shldl %cl, %edx, %ebx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK27-NEXT: movl %ebx, 28(%edx) +; FALLBACK27-NEXT: movl %eax, 24(%edx) +; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: shldl %cl, %esi, %ebp +; FALLBACK27-NEXT: movl %ebp, 4(%edx) +; FALLBACK27-NEXT: movl %edi, 8(%edx) +; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload +; 
FALLBACK27-NEXT: movl %ecx, 12(%edx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 16(%edx) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 20(%edx) +; FALLBACK27-NEXT: movl %eax, (%edx) +; FALLBACK27-NEXT: addl $92, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: vzeroupper +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: shl_32bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $108, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK28-NEXT: movzbl (%eax), %ecx +; FALLBACK28-NEXT: movb %cl, %dh +; FALLBACK28-NEXT: shlb $3, %dh +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: andb $28, %cl +; FALLBACK28-NEXT: negb %cl +; FALLBACK28-NEXT: movsbl %cl, %eax +; FALLBACK28-NEXT: movl 84(%esp,%eax), %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movb %dh, %dl +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: movl 80(%esp,%eax), %esi +; FALLBACK28-NEXT: movl %eax, %ebx +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %edi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, %eax +; 
FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %esi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx +; FALLBACK28-NEXT: movl %ebx, %eax +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: shrl %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %edi +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %edi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; 
FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %edx, (%eax) +; FALLBACK28-NEXT: movl %ebp, 28(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 24(%eax) +; FALLBACK28-NEXT: movl %ebx, 4(%eax) +; FALLBACK28-NEXT: movl %esi, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: addl $108, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: vzeroupper +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: shl_32bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $92, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK29-NEXT: movzbl (%eax), %eax +; FALLBACK29-NEXT: movl %eax, %ecx +; FALLBACK29-NEXT: shlb $3, %cl +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: andb $28, %al +; FALLBACK29-NEXT: negb %al +; FALLBACK29-NEXT: movsbl %al, %ebp +; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx +; FALLBACK29-NEXT: shldl %cl, %edx, %eax +; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi +; FALLBACK29-NEXT: shldl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx +; FALLBACK29-NEXT: shldl %cl, %ebx, %edi +; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl %edx, %eax +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK29-NEXT: shldl %cl, %esi, %eax +; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp +; FALLBACK29-NEXT: shldl %cl, %edx, %ebp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK29-NEXT: movl %ebp, 28(%edx) +; FALLBACK29-NEXT: movl %eax, 24(%edx) +; FALLBACK29-NEXT: movl %esi, %eax +; FALLBACK29-NEXT: shll %cl, %eax +; FALLBACK29-NEXT: shldl %cl, %esi, %ebx +; FALLBACK29-NEXT: movl %ebx, 4(%edx) +; FALLBACK29-NEXT: movl %edi, 8(%edx) +; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 12(%edx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 16(%edx) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 20(%edx) +; FALLBACK29-NEXT: movl %eax, (%edx) +; FALLBACK29-NEXT: addl $92, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: vzeroupper +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: shl_32bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $108, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK30-NEXT: movzbl (%eax), %ecx +; 
FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: shlb $3, %al +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: andb $28, %cl +; FALLBACK30-NEXT: negb %cl +; FALLBACK30-NEXT: movsbl %cl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %eax, %esi, %edi +; FALLBACK30-NEXT: movl %eax, %ebx +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx +; FALLBACK30-NEXT: movl %ecx, %esi +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx +; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp +; FALLBACK30-NEXT: orl %ecx, %ebp +; FALLBACK30-NEXT: shlxl %eax, %esi, %edi +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, %ecx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: orl %edi, %ecx +; FALLBACK30-NEXT: shlxl %eax, %esi, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), 
%edi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK30-NEXT: movl %edi, (%esi) +; FALLBACK30-NEXT: movl %edx, 28(%esi) +; FALLBACK30-NEXT: movl %eax, 24(%esi) +; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: addl $108, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: vzeroupper +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: shl_32bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $92, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK31-NEXT: movzbl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, %ecx +; FALLBACK31-NEXT: shlb $3, %cl +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: andb $28, %al +; FALLBACK31-NEXT: negb %al +; FALLBACK31-NEXT: movsbl %al, %ebx +; FALLBACK31-NEXT: movl 
64(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %eax, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx +; FALLBACK31-NEXT: shldl %cl, %edx, %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi +; FALLBACK31-NEXT: shldl %cl, %edi, %edx +; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp +; FALLBACK31-NEXT: shldl %cl, %ebp, %edi +; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl %edx, %eax +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: shldl %cl, %esi, %eax +; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx +; FALLBACK31-NEXT: shldl %cl, %edx, %ebx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK31-NEXT: movl %ebx, 28(%edx) +; FALLBACK31-NEXT: movl %eax, 24(%edx) +; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: shldl %cl, %esi, %ebp +; FALLBACK31-NEXT: movl %ebp, 4(%edx) +; FALLBACK31-NEXT: movl %edi, 8(%edx) +; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%edx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 16(%edx) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 20(%edx) +; FALLBACK31-NEXT: movl %eax, (%edx) +; FALLBACK31-NEXT: addl $92, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: vzeroupper +; FALLBACK31-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 
%byteOff, 3 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; FALLBACK0-LABEL: shl_32bytes_dwordOff: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: movl %esi, %eax +; FALLBACK0-NEXT: shlb $5, %al +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: shlb $2, %sil +; FALLBACK0-NEXT: andb $24, %sil +; FALLBACK0-NEXT: negb %sil +; FALLBACK0-NEXT: movsbq %sil, %r10 +; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 +; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq %r8, %r9 +; FALLBACK0-NEXT: shrq %r9 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r9 +; FALLBACK0-NEXT: orq %r11, %r9 +; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 +; FALLBACK0-NEXT: movq %r10, %rbx +; FALLBACK0-NEXT: shrq %rbx +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: orq %r11, %rbx +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: shrq %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: 
shlq %cl, %r8 +; FALLBACK0-NEXT: movq %r8, (%rdx) +; FALLBACK0-NEXT: movq %rdi, 16(%rdx) +; FALLBACK0-NEXT: movq %rbx, 24(%rdx) +; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: shl_32bytes_dwordOff: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: movl %esi, %ecx +; FALLBACK1-NEXT: shlb $5, %cl +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: shlb $2, %sil +; FALLBACK1-NEXT: andb $24, %sil +; FALLBACK1-NEXT: negb %sil +; FALLBACK1-NEXT: movsbq %sil, %rax +; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK1-NEXT: shldq %cl, %rax, %rsi +; FALLBACK1-NEXT: shldq %cl, %r8, %rax +; FALLBACK1-NEXT: shlq %cl, %r8 +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rdi, 24(%rdx) +; FALLBACK1-NEXT: movq %r8, (%rdx) +; FALLBACK1-NEXT: movq %rax, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: shl_32bytes_dwordOff: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi +; FALLBACK2-NEXT: movl %esi, %eax +; FALLBACK2-NEXT: shlb $5, %al +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: 
movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: shlb $2, %sil +; FALLBACK2-NEXT: andb $24, %sil +; FALLBACK2-NEXT: negb %sil +; FALLBACK2-NEXT: movsbq %sil, %rsi +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 +; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 +; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shrq %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: shrq %rsi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: shrq %rcx +; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, 24(%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: shl_32bytes_dwordOff: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: movl %esi, %ecx +; FALLBACK3-NEXT: shlb $5, %cl +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: shlb $2, %sil +; FALLBACK3-NEXT: andb $24, %sil +; FALLBACK3-NEXT: negb %sil +; FALLBACK3-NEXT: movsbq %sil, %rax +; 
FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK3-NEXT: shldq %cl, %rax, %rsi +; FALLBACK3-NEXT: shldq %cl, %r8, %rax +; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rdi, 24(%rdx) +; FALLBACK3-NEXT: movq %rcx, (%rdx) +; FALLBACK3-NEXT: movq %rax, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: shl_32bytes_dwordOff: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movzbl (%rsi), %ecx +; FALLBACK4-NEXT: movl %ecx, %eax +; FALLBACK4-NEXT: shlb $5, %al +; FALLBACK4-NEXT: xorps %xmm2, %xmm2 +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: shlb $2, %cl +; FALLBACK4-NEXT: andb $24, %cl +; FALLBACK4-NEXT: negb %cl +; FALLBACK4-NEXT: movsbq %cl, %r8 +; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK4-NEXT: movq %r10, %rdi +; FALLBACK4-NEXT: shrq %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %rdi +; FALLBACK4-NEXT: orq %r9, %rdi +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK4-NEXT: movq %r8, %r11 +; FALLBACK4-NEXT: shrq %r11 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: orq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: movq %r9, %r10 +; FALLBACK4-NEXT: shrq %r10 +; FALLBACK4-NEXT: 
movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, (%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %r11, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, 24(%rdx) +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: shl_32bytes_dwordOff: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movzbl (%rsi), %eax +; FALLBACK5-NEXT: movl %eax, %ecx +; FALLBACK5-NEXT: shlb $5, %cl +; FALLBACK5-NEXT: xorps %xmm2, %xmm2 +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: shlb $2, %al +; FALLBACK5-NEXT: andb $24, %al +; FALLBACK5-NEXT: negb %al +; FALLBACK5-NEXT: movsbq %al, %rax +; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK5-NEXT: shldq %cl, %rax, %rsi +; FALLBACK5-NEXT: movq %r8, %r9 +; FALLBACK5-NEXT: shlq %cl, %r9 +; FALLBACK5-NEXT: shldq %cl, %r8, %rax +; FALLBACK5-NEXT: movq %rax, 8(%rdx) +; FALLBACK5-NEXT: movq %rsi, 16(%rdx) +; FALLBACK5-NEXT: movq %rdi, 24(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: shl_32bytes_dwordOff: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movzbl (%rsi), %ecx +; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: shlb $5, %al +; FALLBACK6-NEXT: xorps %xmm2, %xmm2 +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
FALLBACK6-NEXT: shlb $2, %cl +; FALLBACK6-NEXT: andb $24, %cl +; FALLBACK6-NEXT: negb %cl +; FALLBACK6-NEXT: movsbq %cl, %rcx +; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: shrq %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: shrq %rcx +; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r8, %rcx +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: shl_32bytes_dwordOff: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movzbl (%rsi), %eax +; FALLBACK7-NEXT: movl %eax, %ecx +; FALLBACK7-NEXT: shlb $5, %cl +; FALLBACK7-NEXT: xorps %xmm2, %xmm2 +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: shlb $2, %al +; FALLBACK7-NEXT: andb $24, %al +; FALLBACK7-NEXT: negb %al +; FALLBACK7-NEXT: movsbq %al, %rax +; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK7-NEXT: shldq %cl, %rax, %rsi +; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK7-NEXT: # kill: def $cl killed $cl 
killed $rcx +; FALLBACK7-NEXT: shldq %cl, %r8, %rax +; FALLBACK7-NEXT: movq %rax, 8(%rdx) +; FALLBACK7-NEXT: movq %rsi, 16(%rdx) +; FALLBACK7-NEXT: movq %rdi, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: shl_32bytes_dwordOff: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK8-NEXT: movzbl (%rsi), %ecx +; FALLBACK8-NEXT: movl %ecx, %eax +; FALLBACK8-NEXT: shlb $5, %al +; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: shlb $2, %cl +; FALLBACK8-NEXT: andb $24, %cl +; FALLBACK8-NEXT: negb %cl +; FALLBACK8-NEXT: movsbq %cl, %r8 +; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK8-NEXT: movq %r10, %rdi +; FALLBACK8-NEXT: shrq %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %rdi +; FALLBACK8-NEXT: orq %r9, %rdi +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK8-NEXT: movq %r8, %r11 +; FALLBACK8-NEXT: shrq %r11 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: orq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: movq %r9, %r10 +; FALLBACK8-NEXT: shrq %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, (%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %r11, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, 24(%rdx) +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: shl_32bytes_dwordOff: +; FALLBACK9: # 
%bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: movzbl (%rsi), %eax +; FALLBACK9-NEXT: movl %eax, %ecx +; FALLBACK9-NEXT: shlb $5, %cl +; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: shlb $2, %al +; FALLBACK9-NEXT: andb $24, %al +; FALLBACK9-NEXT: negb %al +; FALLBACK9-NEXT: movsbq %al, %rax +; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK9-NEXT: shldq %cl, %rax, %rsi +; FALLBACK9-NEXT: movq %r8, %r9 +; FALLBACK9-NEXT: shlq %cl, %r9 +; FALLBACK9-NEXT: shldq %cl, %r8, %rax +; FALLBACK9-NEXT: movq %rax, 8(%rdx) +; FALLBACK9-NEXT: movq %rsi, 16(%rdx) +; FALLBACK9-NEXT: movq %rdi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: shl_32bytes_dwordOff: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: movzbl (%rsi), %ecx +; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: shlb $2, %cl +; FALLBACK10-NEXT: andb $24, %cl +; FALLBACK10-NEXT: negb %cl +; FALLBACK10-NEXT: movsbq %cl, %rcx +; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: shrq %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi 
+; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: shrq %rcx +; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r8, %rcx +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: shl_32bytes_dwordOff: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: movzbl (%rsi), %eax +; FALLBACK11-NEXT: movl %eax, %ecx +; FALLBACK11-NEXT: shlb $5, %cl +; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: shlb $2, %al +; FALLBACK11-NEXT: andb $24, %al +; FALLBACK11-NEXT: negb %al +; FALLBACK11-NEXT: movsbq %al, %rax +; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK11-NEXT: shldq %cl, %rax, %rsi +; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK11-NEXT: shldq %cl, %r8, %rax +; FALLBACK11-NEXT: movq %rax, 8(%rdx) +; FALLBACK11-NEXT: movq %rsi, 16(%rdx) +; FALLBACK11-NEXT: movq %rdi, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: shl_32bytes_dwordOff: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK12-NEXT: movzbl (%rsi), %ecx +; FALLBACK12-NEXT: movl %ecx, %eax +; FALLBACK12-NEXT: shlb $5, %al +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: shlb $2, %cl +; 
FALLBACK12-NEXT: andb $24, %cl +; FALLBACK12-NEXT: negb %cl +; FALLBACK12-NEXT: movsbq %cl, %r8 +; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 +; FALLBACK12-NEXT: movq %r10, %rdi +; FALLBACK12-NEXT: shrq %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %rdi +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 +; FALLBACK12-NEXT: movq %r8, %r11 +; FALLBACK12-NEXT: shrq %r11 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: orq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: movq %r9, %r10 +; FALLBACK12-NEXT: shrq %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, (%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %r11, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, 24(%rdx) +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: shl_32bytes_dwordOff: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK13-NEXT: movzbl (%rsi), %eax +; FALLBACK13-NEXT: movl %eax, %ecx +; FALLBACK13-NEXT: shlb $5, %cl +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: shlb $2, %al +; FALLBACK13-NEXT: andb $24, %al +; FALLBACK13-NEXT: negb %al +; FALLBACK13-NEXT: movsbq %al, %rax +; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi +; 
FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK13-NEXT: shldq %cl, %rax, %rsi +; FALLBACK13-NEXT: movq %r8, %r9 +; FALLBACK13-NEXT: shlq %cl, %r9 +; FALLBACK13-NEXT: shldq %cl, %r8, %rax +; FALLBACK13-NEXT: movq %rax, 8(%rdx) +; FALLBACK13-NEXT: movq %rsi, 16(%rdx) +; FALLBACK13-NEXT: movq %rdi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: shl_32bytes_dwordOff: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK14-NEXT: movzbl (%rsi), %ecx +; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: shlb $2, %cl +; FALLBACK14-NEXT: andb $24, %cl +; FALLBACK14-NEXT: negb %cl +; FALLBACK14-NEXT: movsbq %cl, %rcx +; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 +; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 +; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: shrq %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: shrq %rcx +; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r8, %rcx +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: shl_32bytes_dwordOff: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: 
vmovups (%rdi), %ymm0 +; FALLBACK15-NEXT: movzbl (%rsi), %eax +; FALLBACK15-NEXT: movl %eax, %ecx +; FALLBACK15-NEXT: shlb $5, %cl +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: shlb $2, %al +; FALLBACK15-NEXT: andb $24, %al +; FALLBACK15-NEXT: negb %al +; FALLBACK15-NEXT: movsbq %al, %rax +; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi +; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi +; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi +; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 +; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax +; FALLBACK15-NEXT: shldq %cl, %rax, %rsi +; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK15-NEXT: shldq %cl, %r8, %rax +; FALLBACK15-NEXT: movq %rax, 8(%rdx) +; FALLBACK15-NEXT: movq %rsi, 16(%rdx) +; FALLBACK15-NEXT: movq %rdi, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; X86-SSE2-LABEL: shl_32bytes_dwordOff: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: pushl %edi +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SSE2-NEXT: movl (%ebp), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 4(%ebp), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%ebp), %esi +; X86-SSE2-NEXT: movl 12(%ebp), %edi +; X86-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: movl 20(%ebp), %edx +; X86-SSE2-NEXT: movl 24(%ebp), %eax +; X86-SSE2-NEXT: movl 28(%ebp), %ebp +; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) 
+; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: shlb $2, %cl +; X86-SSE2-NEXT: andb $28, %cl +; X86-SSE2-NEXT: negb %cl +; X86-SSE2-NEXT: movsbl %cl, %edx +; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi +; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi +; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx +; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp +; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx +; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %ebp, 16(%eax) +; X86-SSE2-NEXT: movl %ebx, 20(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: addl $92, %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: shl_32bytes_dwordOff: +; X86-SSE42: # %bb.0: +; X86-SSE42-NEXT: subl $76, %esp +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE42-NEXT: movups 
(%edx), %xmm0 +; X86-SSE42-NEXT: movups 16(%edx), %xmm1 +; X86-SSE42-NEXT: movzbl (%ecx), %ecx +; X86-SSE42-NEXT: xorps %xmm2, %xmm2 +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, (%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: shlb $2, %cl +; X86-SSE42-NEXT: andb $28, %cl +; X86-SSE42-NEXT: negb %cl +; X86-SSE42-NEXT: movsbl %cl, %ecx +; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0 +; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1 +; X86-SSE42-NEXT: movups %xmm1, 16(%eax) +; X86-SSE42-NEXT: movups %xmm0, (%eax) +; X86-SSE42-NEXT: addl $76, %esp +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: shl_32bytes_dwordOff: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: subl $76, %esp +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: vmovups (%edx), %ymm0 +; X86-AVX-NEXT: movzbl (%ecx), %ecx +; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vmovups %ymm1, (%esp) +; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: shlb $2, %cl +; X86-AVX-NEXT: andb $28, %cl +; X86-AVX-NEXT: negb %cl +; X86-AVX-NEXT: movsbl %cl, %ecx +; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 +; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 +; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: addl $76, %esp +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 5 + %res = shl i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: shl_32bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 
24(%rdi), %rdi ; X64-SSE2-NEXT: movzbl (%rsi), %esi +; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andb $31, %sil +; X64-SSE2-NEXT: shlb $3, %sil +; X64-SSE2-NEXT: andb $24, %sil ; X64-SSE2-NEXT: negb %sil ; X64-SSE2-NEXT: movsbq %sil, %rax -; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx -; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx +; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi +; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax ; X64-SSE2-NEXT: movq %rax, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: shl_32bytes: +; X64-SSE42-LABEL: shl_32bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movzbl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm2, %xmm2 -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andb $31, %al +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: shlb $3, %al +; X64-SSE42-NEXT: andb $24, %al ; 
X64-SSE42-NEXT: negb %al ; X64-SSE42-NEXT: movsbq %al, %rax -; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm0 -; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm1 +; X64-SSE42-NEXT: movups -40(%rsp,%rax), %xmm0 +; X64-SSE42-NEXT: movups -24(%rsp,%rax), %xmm1 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: shl_32bytes: +; X64-AVX-LABEL: shl_32bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: movzbl (%rsi), %eax ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andb $31, %al +; X64-AVX-NEXT: shlb $3, %al +; X64-AVX-NEXT: andb $24, %al ; X64-AVX-NEXT: negb %al ; X64-AVX-NEXT: movsbq %al, %rax -; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1 +; X64-AVX-NEXT: vmovups -40(%rsp,%rax), %xmm0 +; X64-AVX-NEXT: vmovups -24(%rsp,%rax), %xmm1 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; -; X86-SSE2-LABEL: shl_32bytes: +; X86-SSE2-LABEL: shl_32bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $72, %esp -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl (%edi), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%edi), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%edi), %esi -; X86-SSE2-NEXT: movl 12(%edi), %ebx -; X86-SSE2-NEXT: movl 16(%edi), %ebp -; X86-SSE2-NEXT: movzbl (%eax), %eax -; X86-SSE2-NEXT: movl 20(%edi), %edx -; X86-SSE2-NEXT: movl 24(%edi), %ecx -; X86-SSE2-NEXT: movl 28(%edi), %edi -; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-SSE2-NEXT: movl (%ebp), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 4(%ebp), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%ebp), %esi +; X86-SSE2-NEXT: movl 12(%ebp), %edi +; X86-SSE2-NEXT: movl 16(%ebp), %ebx +; X86-SSE2-NEXT: movzbl (%ecx), %ecx +; X86-SSE2-NEXT: movl 20(%ebp), %edx +; X86-SSE2-NEXT: movl 24(%ebp), %eax +; X86-SSE2-NEXT: movl 28(%ebp), %ebp ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andb $31, %al -; X86-SSE2-NEXT: negb %al -; X86-SSE2-NEXT: movsbl %al, %edx -; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-SSE2-NEXT: shlb $3, %cl +; X86-SSE2-NEXT: andb $24, %cl +; X86-SSE2-NEXT: negb %cl +; X86-SSE2-NEXT: movsbl %cl, %edx +; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax -; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi -; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi -; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx -; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp -; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx -; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx +; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi +; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi +; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx +; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp +; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx +; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %edx, 24(%eax) ; X86-SSE2-NEXT: movl %ecx, 28(%eax) @@ -1368,18 +8741,18 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $72, %esp +; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: shl_32bytes: +; X86-SSE42-LABEL: shl_32bytes_qwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $64, %esp +; X86-SSE42-NEXT: subl $76, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx @@ 
-1387,23 +8760,24 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm2, %xmm2 -; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm2, (%esp) -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andb $31, %cl +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, (%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: shlb $3, %cl +; X86-SSE42-NEXT: andb $24, %cl ; X86-SSE42-NEXT: negb %cl ; X86-SSE42-NEXT: movsbl %cl, %ecx ; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0 ; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $64, %esp +; X86-SSE42-NEXT: addl $76, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: shl_32bytes: +; X86-AVX-LABEL: shl_32bytes_qwordOff: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $64, %esp +; X86-AVX-NEXT: subl $76, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1412,25 +8786,3037 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovups %ymm1, (%esp) ; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andb $31, %cl +; X86-AVX-NEXT: shlb $3, %cl +; X86-AVX-NEXT: andb $24, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx ; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 ; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $64, %esp +; X86-AVX-NEXT: addl $76, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr 
%src.ptr, align 1 - %byteOff = load i256, ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %qwordOff = load i256, ptr %qwordOff.ptr, align 1 + %bitOff = shl i256 %qwordOff, 6 %res = shl i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } + define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: ashr_32bytes: +; FALLBACK0-LABEL: ashr_32bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: leal (,%rsi,8), %eax +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: sarq $63, %rdi +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: andb $24, %sil +; FALLBACK0-NEXT: movzbl %sil, %r9d +; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r11, %r8 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; 
FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: sarq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, 24(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %rdi, (%rdx) +; FALLBACK0-NEXT: movq %r8, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: ashr_32bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: leal (,%rsi,8), %ecx +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: sarq $63, %rdi +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: andb $24, %sil +; FALLBACK1-NEXT: movzbl %sil, %eax +; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK1-NEXT: movq %r8, %r9 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: sarq %cl, %rax +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rax, 24(%rdx) +; FALLBACK1-NEXT: movq %rdi, (%rdx) +; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: ashr_32bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi 
+; FALLBACK2-NEXT: leal (,%rsi,8), %eax +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: sarq $63, %rdi +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: andb $24, %sil +; FALLBACK2-NEXT: movzbl %sil, %ecx +; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: addq %rcx, %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: ashr_32bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: leal (,%rsi,8), %ecx +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: sarq $63, %rdi +; FALLBACK3-NEXT: movq %rdi, 
-{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: andb $24, %sil +; FALLBACK3-NEXT: movzbl %sil, %eax +; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK3-NEXT: movq %r8, %r9 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rax, 24(%rdx) +; FALLBACK3-NEXT: movq %rdi, (%rdx) +; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: ashr_32bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movq 16(%rdi), %rcx +; FALLBACK4-NEXT: movq 24(%rdi), %rdi +; FALLBACK4-NEXT: movzbl (%rsi), %esi +; FALLBACK4-NEXT: leal (,%rsi,8), %eax +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: sarq $63, %rdi +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: andb $24, %sil +; FALLBACK4-NEXT: movzbl %sil, %r9d +; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rdi +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; 
FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: sarq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, 24(%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, (%rdx) +; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: ashr_32bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movq 16(%rdi), %rax +; FALLBACK5-NEXT: movq 24(%rdi), %rdi +; FALLBACK5-NEXT: movzbl (%rsi), %esi +; FALLBACK5-NEXT: leal (,%rsi,8), %ecx +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: sarq $63, %rdi +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: andb $24, %sil +; FALLBACK5-NEXT: movzbl %sil, %eax +; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK5-NEXT: movq %rdi, %r8 +; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: sarq %cl, %rsi +; FALLBACK5-NEXT: movq %r10, 8(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq 
%r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: ashr_32bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movq 16(%rdi), %rcx +; FALLBACK6-NEXT: movq 24(%rdi), %rdi +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: sarq $63, %rdi +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: movzbl %sil, %ecx +; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r9, %rcx +; FALLBACK6-NEXT: addq %r8, %r8 +; FALLBACK6-NEXT: shlxq %rax, %r8, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: ashr_32bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movq 16(%rdi), %rax +; FALLBACK7-NEXT: movq 24(%rdi), %rdi +; FALLBACK7-NEXT: movzbl (%rsi), %esi +; FALLBACK7-NEXT: leal (,%rsi,8), %ecx +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rax, 
-{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: sarq $63, %rdi +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: andb $24, %sil +; FALLBACK7-NEXT: movzbl %sil, %eax +; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK7-NEXT: movq %rdi, %r8 +; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq %r10, 8(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rax, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: ashr_32bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK8-NEXT: movq 16(%rdi), %rcx +; FALLBACK8-NEXT: movq 24(%rdi), %rdi +; FALLBACK8-NEXT: movzbl (%rsi), %esi +; FALLBACK8-NEXT: leal (,%rsi,8), %eax +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: sarq $63, %rdi +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: andb $24, %sil +; FALLBACK8-NEXT: movzbl %sil, %r9d +; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: 
shlq %cl, %rdi +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: sarq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, 24(%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, (%rdx) +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: ashr_32bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK9-NEXT: movq 16(%rdi), %rax +; FALLBACK9-NEXT: movq 24(%rdi), %rdi +; FALLBACK9-NEXT: movzbl (%rsi), %esi +; FALLBACK9-NEXT: leal (,%rsi,8), %ecx +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: sarq $63, %rdi +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: andb $24, %sil +; FALLBACK9-NEXT: movzbl %sil, %eax +; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK9-NEXT: movq %rdi, %r8 +; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; 
FALLBACK9-NEXT: sarq %cl, %rsi +; FALLBACK9-NEXT: movq %r10, 8(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: ashr_32bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK10-NEXT: movq 16(%rdi), %rcx +; FALLBACK10-NEXT: movq 24(%rdi), %rdi +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: sarq $63, %rdi +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: movzbl %sil, %ecx +; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r9, %rcx +; FALLBACK10-NEXT: addq %r8, %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: ashr_32bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK11-NEXT: movq 16(%rdi), 
%rax +; FALLBACK11-NEXT: movq 24(%rdi), %rdi +; FALLBACK11-NEXT: movzbl (%rsi), %esi +; FALLBACK11-NEXT: leal (,%rsi,8), %ecx +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: sarq $63, %rdi +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: andb $24, %sil +; FALLBACK11-NEXT: movzbl %sil, %eax +; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK11-NEXT: movq %rdi, %r8 +; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq %r10, 8(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rax, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: ashr_32bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK12-NEXT: movq 16(%rdi), %rcx +; FALLBACK12-NEXT: movq 24(%rdi), %rdi +; FALLBACK12-NEXT: movzbl (%rsi), %esi +; FALLBACK12-NEXT: leal (,%rsi,8), %eax +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: sarq $63, %rdi +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: andb $24, %sil +; FALLBACK12-NEXT: movzbl %sil, %r9d +; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 +; FALLBACK12-NEXT: 
movq -56(%rsp,%r9), %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rdi +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 +; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: sarq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, 24(%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, (%rdx) +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: ashr_32bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK13-NEXT: movq 16(%rdi), %rax +; FALLBACK13-NEXT: movq 24(%rdi), %rdi +; FALLBACK13-NEXT: movzbl (%rsi), %esi +; FALLBACK13-NEXT: leal (,%rsi,8), %ecx +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: sarq $63, %rdi +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: andb $24, %sil +; FALLBACK13-NEXT: movzbl %sil, %eax +; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK13-NEXT: movq %rdi, %r8 +; 
FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: sarq %cl, %rsi +; FALLBACK13-NEXT: movq %r10, 8(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: ashr_32bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK14-NEXT: movq 16(%rdi), %rcx +; FALLBACK14-NEXT: movq 24(%rdi), %rdi +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: sarq $63, %rdi +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: movzbl %sil, %ecx +; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi +; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi +; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: addq %rcx, %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r9, %rcx +; FALLBACK14-NEXT: addq %r8, %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; 
FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: ashr_32bytes: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK15-NEXT: movq 16(%rdi), %rax +; FALLBACK15-NEXT: movq 24(%rdi), %rdi +; FALLBACK15-NEXT: movzbl (%rsi), %esi +; FALLBACK15-NEXT: leal (,%rsi,8), %ecx +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: sarq $63, %rdi +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: andb $24, %sil +; FALLBACK15-NEXT: movzbl %sil, %eax +; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi +; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi +; FALLBACK15-NEXT: movq %rdi, %r8 +; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax +; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq %r10, 8(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: ashr_32bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $108, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK16-NEXT: movl (%esi), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%esi), %ecx +; FALLBACK16-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%esi), %ebx +; FALLBACK16-NEXT: movl 12(%esi), %ebp +; FALLBACK16-NEXT: movl 16(%esi), %edi +; FALLBACK16-NEXT: movzbl (%eax), %ecx +; FALLBACK16-NEXT: movl 20(%esi), %edx +; FALLBACK16-NEXT: movl 24(%esi), %eax +; FALLBACK16-NEXT: movl 28(%esi), %esi +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, %edx +; FALLBACK16-NEXT: shlb $3, %dl +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: sarl $31, %esi +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: andb $28, %cl +; FALLBACK16-NEXT: movzbl %cl, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi +; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax +; FALLBACK16-NEXT: movl %eax, %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movb %dl, %ch +; FALLBACK16-NEXT: notb %ch +; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp +; FALLBACK16-NEXT: movl %ebp, %esi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl %edx, %ebx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%edx,%edx), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %eax, %edi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax +; FALLBACK16-NEXT: leal (%eax,%eax), %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; FALLBACK16-NEXT: sarl %cl, %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl %eax, 28(%ecx) +; FALLBACK16-NEXT: movl %edx, 24(%ecx) +; FALLBACK16-NEXT: movl %edi, 16(%ecx) +; FALLBACK16-NEXT: movl %esi, 20(%ecx) +; FALLBACK16-NEXT: movl %ebp, 8(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 12(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, (%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 4(%ecx) +; FALLBACK16-NEXT: addl $108, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: ashr_32bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $92, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%ecx), %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%ecx), %edx +; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 12(%ecx), %ebp +; FALLBACK17-NEXT: movl 16(%ecx), %ebx +; FALLBACK17-NEXT: movzbl (%eax), %eax +; FALLBACK17-NEXT: movl 20(%ecx), %edi +; FALLBACK17-NEXT: movl 24(%ecx), %edx +; FALLBACK17-NEXT: movl 28(%ecx), %esi +; 
FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, %ecx +; FALLBACK17-NEXT: shlb $3, %cl +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: sarl $31, %esi +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: andb $28, %al +; FALLBACK17-NEXT: movzbl %al, %ebp +; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx +; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK17-NEXT: movl 16(%esp,%ebp), 
%esi +; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl %edx, 24(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: sarl %cl, %eax +; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl %ebx, 16(%ebp) +; FALLBACK17-NEXT: movl %edi, 20(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %esi, (%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: addl $92, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: ashr_32bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $108, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi +; FALLBACK18-NEXT: movl (%esi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%esi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%esi), %ebx +; FALLBACK18-NEXT: movl 12(%esi), %ebp +; FALLBACK18-NEXT: movl 16(%esi), %edi +; FALLBACK18-NEXT: movzbl (%ecx), %ecx +; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movl 24(%esi), %eax +; FALLBACK18-NEXT: movl 28(%esi), %esi +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: 
shlb $3, %al +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: sarl $31, %esi +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: andb $28, %cl +; FALLBACK18-NEXT: movzbl %cl, %edi +; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx +; FALLBACK18-NEXT: movl %eax, %edx +; FALLBACK18-NEXT: notb %dl +; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp +; FALLBACK18-NEXT: orl %ebx, %ebp +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %esi +; FALLBACK18-NEXT: orl %ebx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi +; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx +; FALLBACK18-NEXT: orl %ebx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx +; FALLBACK18-NEXT: movl %eax, %ebx +; 
FALLBACK18-NEXT: addl %ebp, %ebp +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %esi, %ecx +; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %eax, %eax +; FALLBACK18-NEXT: shlxl %edx, %eax, %esi +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edx +; FALLBACK18-NEXT: orl %eax, %edx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl %ebx, 28(%eax) +; FALLBACK18-NEXT: movl %edx, 24(%eax) +; FALLBACK18-NEXT: movl %esi, 16(%eax) +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $108, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: ashr_32bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; 
FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $92, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%ecx), %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%ecx), %edx +; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 12(%ecx), %ebp +; FALLBACK19-NEXT: movl 16(%ecx), %ebx +; FALLBACK19-NEXT: movzbl (%eax), %eax +; FALLBACK19-NEXT: movl 20(%ecx), %edi +; FALLBACK19-NEXT: movl 24(%ecx), %edx +; FALLBACK19-NEXT: movl 28(%ecx), %esi +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, %ecx +; FALLBACK19-NEXT: shlb $3, %cl +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: sarl $31, %esi +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: andb $28, %al +; FALLBACK19-NEXT: movzbl %al, %ebp +; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %esi, %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx +; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl %edx, %esi +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi +; FALLBACK19-NEXT: shrdl %cl, %edi, %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl %eax, 24(%ebp) +; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax +; FALLBACK19-NEXT: movl %eax, 28(%ebp) +; FALLBACK19-NEXT: movl %ebx, 16(%ebp) +; FALLBACK19-NEXT: movl %esi, 20(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 8(%ebp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: movl %edx, (%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 4(%ebp) +; FALLBACK19-NEXT: addl $92, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: ashr_32bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $108, %esp +; 
FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movl 16(%ecx), %esi +; FALLBACK20-NEXT: movl 20(%ecx), %edi +; FALLBACK20-NEXT: movl 24(%ecx), %ebx +; FALLBACK20-NEXT: movl 28(%ecx), %edx +; FALLBACK20-NEXT: movzbl (%eax), %eax +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shlb $3, %cl +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: sarl $31, %edx +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: andb $28, %al +; FALLBACK20-NEXT: movzbl %al, %edi +; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax +; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl %ecx, %edx +; FALLBACK20-NEXT: movb %cl, %dh +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %eax, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, %eax +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; 
FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %eax, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %eax, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK20-NEXT: movl %ebp, %eax +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %eax, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %ebp, %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK20-NEXT: leal (%eax,%eax), %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movb %dh, %cl +; FALLBACK20-NEXT: sarl %cl, %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), 
%ecx +; FALLBACK20-NEXT: movl %eax, 28(%ecx) +; FALLBACK20-NEXT: movl %esi, 4(%ecx) +; FALLBACK20-NEXT: movl %edi, 24(%ecx) +; FALLBACK20-NEXT: movl %ebp, 16(%ecx) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, 20(%ecx) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, 8(%ecx) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, 12(%ecx) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, (%ecx) +; FALLBACK20-NEXT: addl $108, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: ashr_32bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $108, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movl 16(%ecx), %esi +; FALLBACK21-NEXT: movl 20(%ecx), %edi +; FALLBACK21-NEXT: movl 24(%ecx), %ebx +; FALLBACK21-NEXT: movl 28(%ecx), %edx +; FALLBACK21-NEXT: movzbl (%eax), %eax +; FALLBACK21-NEXT: movl %eax, %ecx +; FALLBACK21-NEXT: shlb $3, %cl +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: sarl $31, %edx +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: andb $28, %al +; FALLBACK21-NEXT: movzbl %al, %ebp +; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl %edi, %esi +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %esi, 4(%ebp) +; FALLBACK21-NEXT: movl %ebx, 24(%ebp) +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: sarl %cl, %eax +; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 
12(%ebp) +; FALLBACK21-NEXT: movl %edx, (%ebp) +; FALLBACK21-NEXT: addl $108, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: ashr_32bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $108, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movl 16(%ecx), %esi +; FALLBACK22-NEXT: movl 20(%ecx), %edi +; FALLBACK22-NEXT: movl 24(%ecx), %ebx +; FALLBACK22-NEXT: movl 28(%ecx), %edx +; FALLBACK22-NEXT: movzbl (%eax), %ecx +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: sarl $31, %edx +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: andb $28, %cl +; FALLBACK22-NEXT: movzbl %cl, %edi +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx +; FALLBACK22-NEXT: movl %eax, %edx +; FALLBACK22-NEXT: notb %dl +; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %esi +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK22-NEXT: orl %ebx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: orl %ebx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax +; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx +; FALLBACK22-NEXT: orl %ebp, %ebx +; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx +; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK22-NEXT: sarxl %eax, %edi, %eax +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi +; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: addl %ecx, %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK22-NEXT: orl %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %eax, 28(%edx) +; FALLBACK22-NEXT: movl %ecx, 4(%edx) +; 
FALLBACK22-NEXT: movl %edi, 24(%edx) +; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 12(%edx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, (%edx) +; FALLBACK22-NEXT: addl $108, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: ashr_32bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $108, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movl 16(%ecx), %esi +; FALLBACK23-NEXT: movl 20(%ecx), %edi +; FALLBACK23-NEXT: movl 24(%ecx), %ebx +; FALLBACK23-NEXT: movl 28(%ecx), %edx +; FALLBACK23-NEXT: movzbl (%eax), %eax +; FALLBACK23-NEXT: movl %eax, %ecx +; FALLBACK23-NEXT: shlb $3, %cl +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: sarl $31, %edx +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
FALLBACK23-NEXT: andb $28, %al +; FALLBACK23-NEXT: movzbl %al, %ebx +; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, %edi +; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl %ebx, 4(%eax) +; FALLBACK23-NEXT: movl %ebp, 24(%eax) +; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK23-NEXT: movl %ebx, 28(%eax) +; FALLBACK23-NEXT: movl %esi, 16(%eax) +; FALLBACK23-NEXT: movl %edi, 20(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: movl %esi, 8(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: movl %esi, 12(%eax) +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, (%eax) +; FALLBACK23-NEXT: addl $108, %esp +; 
FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: ashr_32bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $108, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK24-NEXT: movl 16(%ecx), %esi +; FALLBACK24-NEXT: movl 20(%ecx), %edi +; FALLBACK24-NEXT: movl 24(%ecx), %ebx +; FALLBACK24-NEXT: movl 28(%ecx), %edx +; FALLBACK24-NEXT: movzbl (%eax), %eax +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shlb $3, %cl +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: sarl $31, %edx +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: andb $28, %al +; FALLBACK24-NEXT: movzbl %al, %edi +; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax +; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl %ecx, %edx +; FALLBACK24-NEXT: movb %cl, %dh +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %eax, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, %eax +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %eax, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %eax, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %eax, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %ebp, %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK24-NEXT: leal (%eax,%eax), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movb %dh, %cl +; 
FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movb %dh, %cl +; FALLBACK24-NEXT: sarl %cl, %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: movl %eax, 28(%ecx) +; FALLBACK24-NEXT: movl %esi, 4(%ecx) +; FALLBACK24-NEXT: movl %edi, 24(%ecx) +; FALLBACK24-NEXT: movl %ebp, 16(%ecx) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, 20(%ecx) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, 8(%ecx) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, 12(%ecx) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, (%ecx) +; FALLBACK24-NEXT: addl $108, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: ashr_32bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $108, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK25-NEXT: movl 16(%ecx), %esi +; FALLBACK25-NEXT: movl 20(%ecx), %edi +; FALLBACK25-NEXT: movl 24(%ecx), %ebx +; FALLBACK25-NEXT: movl 28(%ecx), %edx +; FALLBACK25-NEXT: movzbl (%eax), %eax +; FALLBACK25-NEXT: movl %eax, %ecx +; FALLBACK25-NEXT: shlb $3, %cl +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovaps 
%xmm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: sarl $31, %edx +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: andb $28, %al +; FALLBACK25-NEXT: movzbl %al, %ebp +; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl %edi, %esi +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %esi, 4(%ebp) +; FALLBACK25-NEXT: movl %ebx, 24(%ebp) +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: sarl %cl, %eax +; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: 
movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %edx, (%ebp) +; FALLBACK25-NEXT: addl $108, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: ashr_32bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $108, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK26-NEXT: movl 16(%ecx), %esi +; FALLBACK26-NEXT: movl 20(%ecx), %edi +; FALLBACK26-NEXT: movl 24(%ecx), %ebx +; FALLBACK26-NEXT: movl 28(%ecx), %edx +; FALLBACK26-NEXT: movzbl (%eax), %ecx +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: shlb $3, %al +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: sarl $31, %edx +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: andb $28, %cl +; FALLBACK26-NEXT: movzbl %cl, %edi +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx +; 
FALLBACK26-NEXT: movl %eax, %edx +; FALLBACK26-NEXT: notb %dl +; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi +; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK26-NEXT: orl %ebx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: orl %ebx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx +; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax +; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx +; FALLBACK26-NEXT: orl %ebp, %ebx +; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx +; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK26-NEXT: sarxl %eax, %edi, %eax +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi +; 
FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %eax, 28(%edx) +; FALLBACK26-NEXT: movl %ecx, 4(%edx) +; FALLBACK26-NEXT: movl %edi, 24(%edx) +; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%edx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, (%edx) +; FALLBACK26-NEXT: addl $108, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: ashr_32bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $108, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK27-NEXT: movl 16(%ecx), %esi +; FALLBACK27-NEXT: movl 20(%ecx), %edi +; FALLBACK27-NEXT: movl 24(%ecx), %ebx +; FALLBACK27-NEXT: movl 28(%ecx), %edx +; FALLBACK27-NEXT: movzbl (%eax), %eax +; FALLBACK27-NEXT: movl %eax, %ecx +; FALLBACK27-NEXT: shlb $3, %cl +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: sarl $31, %edx +; FALLBACK27-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: andb $28, %al +; FALLBACK27-NEXT: movzbl %al, %ebx +; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, %edi +; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl %ebx, 4(%eax) +; FALLBACK27-NEXT: movl %ebp, 24(%eax) +; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK27-NEXT: movl %ebx, 28(%eax) +; FALLBACK27-NEXT: movl %esi, 16(%eax) +; FALLBACK27-NEXT: movl %edi, 20(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: movl %esi, 8(%eax) +; FALLBACK27-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: movl %esi, 12(%eax) +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, (%eax) +; FALLBACK27-NEXT: addl $108, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: ashr_32bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $108, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK28-NEXT: movl 16(%ecx), %esi +; FALLBACK28-NEXT: movl 20(%ecx), %edi +; FALLBACK28-NEXT: movl 24(%ecx), %ebx +; FALLBACK28-NEXT: movl 28(%ecx), %edx +; FALLBACK28-NEXT: movzbl (%eax), %eax +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shlb $3, %cl +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: sarl $31, %edx +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: andb $28, %al +; FALLBACK28-NEXT: movzbl %al, %edi +; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax +; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl %ecx, %edx +; FALLBACK28-NEXT: movb %cl, %dh +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %eax, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, %eax +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %eax, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %eax, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp +; FALLBACK28-NEXT: movl %ebp, %eax +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %eax, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %ebp, %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp +; FALLBACK28-NEXT: movb 
%dh, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax +; FALLBACK28-NEXT: leal (%eax,%eax), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movb %dh, %cl +; FALLBACK28-NEXT: sarl %cl, %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: movl %eax, 28(%ecx) +; FALLBACK28-NEXT: movl %esi, 4(%ecx) +; FALLBACK28-NEXT: movl %edi, 24(%ecx) +; FALLBACK28-NEXT: movl %ebp, 16(%ecx) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, 20(%ecx) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, 8(%ecx) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, 12(%ecx) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, (%ecx) +; FALLBACK28-NEXT: addl $108, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: ashr_32bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $108, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK29-NEXT: movl 16(%ecx), %esi +; FALLBACK29-NEXT: movl 20(%ecx), %edi +; FALLBACK29-NEXT: movl 24(%ecx), %ebx +; FALLBACK29-NEXT: movl 
28(%ecx), %edx +; FALLBACK29-NEXT: movzbl (%eax), %eax +; FALLBACK29-NEXT: movl %eax, %ecx +; FALLBACK29-NEXT: shlb $3, %cl +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: sarl $31, %edx +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: andb $28, %al +; FALLBACK29-NEXT: movzbl %al, %ebp +; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx +; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl %edi, %esi +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %ebp, 
%esi +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %esi, 4(%ebp) +; FALLBACK29-NEXT: movl %ebx, 24(%ebp) +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: sarl %cl, %eax +; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %edx, (%ebp) +; FALLBACK29-NEXT: addl $108, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: ashr_32bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $108, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK30-NEXT: movl 16(%ecx), %esi +; FALLBACK30-NEXT: movl 20(%ecx), %edi +; FALLBACK30-NEXT: movl 24(%ecx), %ebx +; FALLBACK30-NEXT: movl 28(%ecx), %edx +; FALLBACK30-NEXT: movzbl (%eax), %ecx +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: shlb $3, %al +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: sarl $31, %edx +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: andb $28, %cl +; FALLBACK30-NEXT: movzbl %cl, %edi +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx +; FALLBACK30-NEXT: movl %eax, %edx +; FALLBACK30-NEXT: notb %dl +; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi +; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK30-NEXT: orl %ebx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: orl %ebx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx +; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax +; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx +; 
FALLBACK30-NEXT: orl %ebp, %ebx +; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx +; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi +; FALLBACK30-NEXT: sarxl %eax, %edi, %eax +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi +; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: addl %ecx, %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx +; FALLBACK30-NEXT: orl %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %eax, 28(%edx) +; FALLBACK30-NEXT: movl %ecx, 4(%edx) +; FALLBACK30-NEXT: movl %edi, 24(%edx) +; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%edx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, (%edx) +; FALLBACK30-NEXT: addl $108, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: ashr_32bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $108, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 +; FALLBACK31-NEXT: movl 16(%ecx), %esi +; FALLBACK31-NEXT: movl 20(%ecx), %edi +; FALLBACK31-NEXT: movl 24(%ecx), %ebx +; FALLBACK31-NEXT: movl 28(%ecx), %edx +; FALLBACK31-NEXT: movzbl (%eax), %eax +; FALLBACK31-NEXT: movl %eax, %ecx +; FALLBACK31-NEXT: 
shlb $3, %cl +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: sarl $31, %edx +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: andb $28, %al +; FALLBACK31-NEXT: movzbl %al, %ebx +; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi +; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp +; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, %edi +; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp +; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx +; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx +; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl %ebx, 4(%eax) +; FALLBACK31-NEXT: movl %ebp, 24(%eax) +; 
FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; FALLBACK31-NEXT: movl %ebx, 28(%eax) +; FALLBACK31-NEXT: movl %esi, 16(%eax) +; FALLBACK31-NEXT: movl %edi, 20(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: movl %esi, 8(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: movl %esi, 12(%eax) +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, (%eax) +; FALLBACK31-NEXT: addl $108, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %byteOff = load i256, ptr %byteOff.ptr, align 1 + %bitOff = shl i256 %byteOff, 3 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { +; FALLBACK0-LABEL: ashr_32bytes_dwordOff: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rcx +; FALLBACK0-NEXT: movq 8(%rdi), %r8 +; FALLBACK0-NEXT: movq 16(%rdi), %r9 +; FALLBACK0-NEXT: movq 24(%rdi), %rdi +; FALLBACK0-NEXT: movzbl (%rsi), %esi +; FALLBACK0-NEXT: movl %esi, %eax +; FALLBACK0-NEXT: shlb $5, %al +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: sarq $63, %rdi +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: andb $6, %sil +; FALLBACK0-NEXT: movzbl %sil, %r9d +; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), 
%r10 +; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi +; FALLBACK0-NEXT: movq %rdi, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r11, %r8 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %rdi, %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rdi +; FALLBACK0-NEXT: orq %r10, %rdi +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: sarq %cl, %r9 +; FALLBACK0-NEXT: movq %r9, 24(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %rdi, (%rdx) +; FALLBACK0-NEXT: movq %r8, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: ashr_32bytes_dwordOff: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %rdi +; FALLBACK1-NEXT: movzbl (%rsi), %esi +; FALLBACK1-NEXT: movl %esi, %ecx +; FALLBACK1-NEXT: shlb $5, %cl +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: sarq $63, %rdi +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: andb $6, %sil +; FALLBACK1-NEXT: movzbl %sil, %eax +; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), 
%rsi +; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi +; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK1-NEXT: movq %r8, %r9 +; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK1-NEXT: sarq %cl, %rax +; FALLBACK1-NEXT: movq %rsi, 16(%rdx) +; FALLBACK1-NEXT: movq %rax, 24(%rdx) +; FALLBACK1-NEXT: movq %rdi, (%rdx) +; FALLBACK1-NEXT: movq %r9, 8(%rdx) +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: ashr_32bytes_dwordOff: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %rdi +; FALLBACK2-NEXT: movzbl (%rsi), %esi +; FALLBACK2-NEXT: movl %esi, %eax +; FALLBACK2-NEXT: shlb $5, %al +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: sarq $63, %rdi +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: andb $6, %sil +; FALLBACK2-NEXT: movzbl %sil, %ecx +; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi +; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi +; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 +; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 +; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r9, %rsi +; FALLBACK2-NEXT: addq %rcx, %rcx +; 
FALLBACK2-NEXT: shlxq %rax, %rcx, %rax +; FALLBACK2-NEXT: orq %r10, %rax +; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: movq %rax, 16(%rdx) +; FALLBACK2-NEXT: movq %rsi, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: ashr_32bytes_dwordOff: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %rdi +; FALLBACK3-NEXT: movzbl (%rsi), %esi +; FALLBACK3-NEXT: movl %esi, %ecx +; FALLBACK3-NEXT: shlb $5, %cl +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: sarq $63, %rdi +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: andb $6, %sil +; FALLBACK3-NEXT: movzbl %sil, %eax +; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi +; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi +; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK3-NEXT: movq %r8, %r9 +; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 +; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi +; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi +; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax +; FALLBACK3-NEXT: movq %rsi, 16(%rdx) +; FALLBACK3-NEXT: movq %rax, 24(%rdx) +; FALLBACK3-NEXT: movq %rdi, (%rdx) +; FALLBACK3-NEXT: movq %r9, 8(%rdx) +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: ashr_32bytes_dwordOff: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movq 16(%rdi), %rcx +; FALLBACK4-NEXT: movq 24(%rdi), %rdi +; FALLBACK4-NEXT: movzbl (%rsi), %esi +; FALLBACK4-NEXT: movl %esi, %eax +; FALLBACK4-NEXT: shlb $5, %al +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: sarq $63, %rdi +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: andb $6, %sil +; FALLBACK4-NEXT: movzbl %sil, %r9d +; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rdi +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 +; FALLBACK4-NEXT: movq %r10, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r11, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r8, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: sarq %cl, %r9 +; FALLBACK4-NEXT: movq %r9, 24(%rdx) +; FALLBACK4-NEXT: movq %r10, 8(%rdx) +; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %rdi, (%rdx) +; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: ashr_32bytes_dwordOff: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movq 16(%rdi), %rax +; FALLBACK5-NEXT: movq 24(%rdi), %rdi +; FALLBACK5-NEXT: movzbl (%rsi), %esi +; FALLBACK5-NEXT: movl %esi, %ecx +; FALLBACK5-NEXT: shlb $5, %cl +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, 
-{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: sarq $63, %rdi +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: andb $6, %sil +; FALLBACK5-NEXT: movzbl %sil, %eax +; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK5-NEXT: movq %rdi, %r8 +; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK5-NEXT: movq %rax, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK5-NEXT: sarq %cl, %rsi +; FALLBACK5-NEXT: movq %r10, 8(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: ashr_32bytes_dwordOff: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movq 16(%rdi), %rcx +; FALLBACK6-NEXT: movq 24(%rdi), %rdi +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax +; FALLBACK6-NEXT: shlb $5, %al +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: sarq $63, %rdi +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: andb $6, %sil +; FALLBACK6-NEXT: movzbl %sil, %ecx +; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK6-NEXT: # kill: def $al killed $al 
killed $rax def $rax +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK6-NEXT: orq %rsi, %rdi +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK6-NEXT: orq %r9, %rcx +; FALLBACK6-NEXT: addq %r8, %r8 +; FALLBACK6-NEXT: shlxq %rax, %r8, %rax +; FALLBACK6-NEXT: orq %r10, %rax +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rax, 8(%rdx) +; FALLBACK6-NEXT: movq %rcx, 16(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: ashr_32bytes_dwordOff: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movq 16(%rdi), %rax +; FALLBACK7-NEXT: movq 24(%rdi), %rdi +; FALLBACK7-NEXT: movzbl (%rsi), %esi +; FALLBACK7-NEXT: movl %esi, %ecx +; FALLBACK7-NEXT: shlb $5, %cl +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: sarq $63, %rdi +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: andb $6, %sil +; FALLBACK7-NEXT: movzbl %sil, %eax +; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK7-NEXT: movq %rdi, %r8 +; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK7-NEXT: movq %rax, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK7-NEXT: movq %r10, 8(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rax, 24(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: ashr_32bytes_dwordOff: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbx +; 
FALLBACK8-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK8-NEXT: movq 16(%rdi), %rcx +; FALLBACK8-NEXT: movq 24(%rdi), %rdi +; FALLBACK8-NEXT: movzbl (%rsi), %esi +; FALLBACK8-NEXT: movl %esi, %eax +; FALLBACK8-NEXT: shlb $5, %al +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: sarq $63, %rdi +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: andb $6, %sil +; FALLBACK8-NEXT: movzbl %sil, %r9d +; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rdi +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 +; FALLBACK8-NEXT: movq %r10, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r11, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r8, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: sarq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, 24(%rdx) +; FALLBACK8-NEXT: movq %r10, 8(%rdx) +; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %rdi, (%rdx) +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: ashr_32bytes_dwordOff: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK9-NEXT: movq 16(%rdi), %rax 
+; FALLBACK9-NEXT: movq 24(%rdi), %rdi +; FALLBACK9-NEXT: movzbl (%rsi), %esi +; FALLBACK9-NEXT: movl %esi, %ecx +; FALLBACK9-NEXT: shlb $5, %cl +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: sarq $63, %rdi +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: andb $6, %sil +; FALLBACK9-NEXT: movzbl %sil, %eax +; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK9-NEXT: movq %rdi, %r8 +; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK9-NEXT: movq %rax, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK9-NEXT: sarq %cl, %rsi +; FALLBACK9-NEXT: movq %r10, 8(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: ashr_32bytes_dwordOff: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK10-NEXT: movq 16(%rdi), %rcx +; FALLBACK10-NEXT: movq 24(%rdi), %rdi +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax +; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: sarq $63, %rdi +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: andb $6, %sil +; FALLBACK10-NEXT: movzbl %sil, %ecx +; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; 
FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %rsi, %rdi +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK10-NEXT: orq %r9, %rcx +; FALLBACK10-NEXT: addq %r8, %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %rcx, 16(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: ashr_32bytes_dwordOff: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK11-NEXT: movq 16(%rdi), %rax +; FALLBACK11-NEXT: movq 24(%rdi), %rdi +; FALLBACK11-NEXT: movzbl (%rsi), %esi +; FALLBACK11-NEXT: movl %esi, %ecx +; FALLBACK11-NEXT: shlb $5, %cl +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: sarq $63, %rdi +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: andb $6, %sil +; FALLBACK11-NEXT: movzbl %sil, %eax +; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK11-NEXT: movq %rdi, %r8 +; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK11-NEXT: movq %rax, %r10 +; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 +; 
FALLBACK11-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK11-NEXT: movq %r10, 8(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rax, 24(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: ashr_32bytes_dwordOff: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK12-NEXT: movq 16(%rdi), %rcx +; FALLBACK12-NEXT: movq 24(%rdi), %rdi +; FALLBACK12-NEXT: movzbl (%rsi), %esi +; FALLBACK12-NEXT: movl %esi, %eax +; FALLBACK12-NEXT: shlb $5, %al +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: sarq $63, %rdi +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: andb $6, %sil +; FALLBACK12-NEXT: movzbl %sil, %r9d +; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 +; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rdi +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 +; FALLBACK12-NEXT: movq %r10, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq %r11, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r8, %r10 +; 
FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: sarq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, 24(%rdx) +; FALLBACK12-NEXT: movq %r10, 8(%rdx) +; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %rdi, (%rdx) +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: ashr_32bytes_dwordOff: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK13-NEXT: movq 16(%rdi), %rax +; FALLBACK13-NEXT: movq 24(%rdi), %rdi +; FALLBACK13-NEXT: movzbl (%rsi), %esi +; FALLBACK13-NEXT: movl %esi, %ecx +; FALLBACK13-NEXT: shlb $5, %cl +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: sarq $63, %rdi +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: andb $6, %sil +; FALLBACK13-NEXT: movzbl %sil, %eax +; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK13-NEXT: movq %rdi, %r8 +; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK13-NEXT: movq %rax, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK13-NEXT: sarq %cl, %rsi +; FALLBACK13-NEXT: movq %r10, 8(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: ashr_32bytes_dwordOff: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK14-NEXT: movq 16(%rdi), %rcx +; FALLBACK14-NEXT: movq 24(%rdi), %rdi +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax +; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: sarq $63, %rdi +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: andb $6, %sil +; FALLBACK14-NEXT: movzbl %sil, %ecx +; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi +; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi +; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 +; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 +; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %rsi, %rdi +; FALLBACK14-NEXT: addq %rcx, %rcx +; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx +; FALLBACK14-NEXT: orq %r9, %rcx +; FALLBACK14-NEXT: addq %r8, %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %rax +; FALLBACK14-NEXT: orq %r10, %rax +; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rcx, 16(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: ashr_32bytes_dwordOff: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 +; FALLBACK15-NEXT: movq 16(%rdi), %rax +; FALLBACK15-NEXT: movq 24(%rdi), %rdi +; FALLBACK15-NEXT: movzbl (%rsi), %esi +; FALLBACK15-NEXT: movl %esi, %ecx +; FALLBACK15-NEXT: shlb $5, %cl +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: sarq $63, %rdi +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: 
movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: andb $6, %sil +; FALLBACK15-NEXT: movzbl %sil, %eax +; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi +; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi +; FALLBACK15-NEXT: movq %rdi, %r8 +; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 +; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 +; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax +; FALLBACK15-NEXT: movq %rax, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 +; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 +; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax +; FALLBACK15-NEXT: movq %r10, 8(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: retq +; +; X86-SSE2-LABEL: ashr_32bytes_dwordOff: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: pushl %ebx +; X86-SSE2-NEXT: pushl %edi +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: subl $92, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl (%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 4(%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%eax), %edi +; X86-SSE2-NEXT: movl 12(%eax), %ebx +; X86-SSE2-NEXT: movl 16(%eax), %ebp +; X86-SSE2-NEXT: movl 20(%eax), %esi +; X86-SSE2-NEXT: movl 24(%eax), %edx +; X86-SSE2-NEXT: movl 28(%eax), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzbl (%eax), %eax +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-SSE2-NEXT: sarl $31, %ecx +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: andl $7, %eax +; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi +; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi +; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx +; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp +; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx +; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %edx, 28(%eax) +; X86-SSE2-NEXT: movl %ebp, 16(%eax) +; X86-SSE2-NEXT: movl %ebx, 20(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: addl $92, %esp +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: retl +; +; X86-SSE42-LABEL: ashr_32bytes_dwordOff: +; X86-SSE42: # %bb.0: +; X86-SSE42-NEXT: pushl %ebx +; X86-SSE42-NEXT: pushl %edi +; X86-SSE42-NEXT: pushl %esi +; X86-SSE42-NEXT: subl $64, %esp +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE42-NEXT: movups (%edx), %xmm0 +; X86-SSE42-NEXT: movl 
16(%edx), %esi +; X86-SSE42-NEXT: movl 20(%edx), %edi +; X86-SSE42-NEXT: movl 24(%edx), %ebx +; X86-SSE42-NEXT: movl 28(%edx), %edx +; X86-SSE42-NEXT: movzbl (%ecx), %ecx +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) +; X86-SSE42-NEXT: sarl $31, %edx +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: andl $7, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1 +; X86-SSE42-NEXT: movups %xmm1, 16(%eax) +; X86-SSE42-NEXT: movups %xmm0, (%eax) +; X86-SSE42-NEXT: addl $64, %esp +; X86-SSE42-NEXT: popl %esi +; X86-SSE42-NEXT: popl %edi +; X86-SSE42-NEXT: popl %ebx +; X86-SSE42-NEXT: retl +; +; X86-AVX-LABEL: ashr_32bytes_dwordOff: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %ebx +; X86-AVX-NEXT: pushl %edi +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: subl $64, %esp +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: vmovups (%edx), %xmm0 +; X86-AVX-NEXT: movl 16(%edx), %esi +; X86-AVX-NEXT: movl 20(%edx), %edi +; X86-AVX-NEXT: movl 24(%edx), %ebx +; X86-AVX-NEXT: movl 28(%edx), %edx +; X86-AVX-NEXT: movzbl (%ecx), %ecx +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovaps %xmm0, (%esp) +; X86-AVX-NEXT: sarl $31, %edx +; X86-AVX-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: andl $7, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 +; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1 +; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: addl $64, %esp +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: popl %edi +; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: retl + %src = load i256, ptr %src.ptr, align 1 + %dwordOff = load i256, ptr %dwordOff.ptr, align 1 + %bitOff = shl i256 %dwordOff, 5 + %res = ashr i256 %src, %bitOff + store i256 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: ashr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx @@ -1446,18 +11832,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $31, %esi -; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax -; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: andl $3, %esi +; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax +; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx +; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi +; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: ashr_32bytes: +; X64-SSE42-LABEL: 
ashr_32bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movq 16(%rdi), %rax @@ -1465,20 +11851,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE42-NEXT: movzbl (%rsi), %esi ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: sarq $63, %rcx ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andl $31, %esi -; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0 -; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1 +; X64-SSE42-NEXT: andl $3, %esi +; X64-SSE42-NEXT: movups -72(%rsp,%rsi,8), %xmm0 +; X64-SSE42-NEXT: movups -56(%rsp,%rsi,8), %xmm1 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_32bytes: +; X64-AVX-LABEL: ashr_32bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovups (%rdi), %xmm0 ; X64-AVX-NEXT: movq 16(%rdi), %rax @@ -1486,31 +11872,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX-NEXT: movzbl (%rsi), %esi ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: sarq $63, %rcx ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 +; X64-AVX-NEXT: andl $3, %esi +; X64-AVX-NEXT: vmovups -72(%rsp,%rsi,8), %xmm0 +; X64-AVX-NEXT: vmovups 
-56(%rsp,%rsi,8), %xmm1 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: retq ; -; X86-SSE2-LABEL: ashr_32bytes: +; X86-SSE2-LABEL: ashr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $72, %esp +; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %edi ; X86-SSE2-NEXT: movl 12(%eax), %ebx ; X86-SSE2-NEXT: movl 16(%eax), %ebp @@ -1525,7 +11911,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -1538,17 +11924,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $31, %eax -; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx +; X86-SSE2-NEXT: andl $3, %eax +; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill 
-; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi +; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi +; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx +; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp +; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx +; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) @@ -1558,16 +11944,16 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $72, %esp +; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: ashr_32bytes: +; X86-SSE42-LABEL: ashr_32bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi @@ -1586,7 +11972,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, (%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -1596,9 +11982,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andl $31, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 -; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1 +; X86-SSE42-NEXT: andl $3, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $64, %esp @@ -1607,7 +11993,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: ashr_32bytes: +; X86-AVX-LABEL: ashr_32bytes_qwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi @@ -1626,7 +12012,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm0, (%esp) +; X86-AVX-NEXT: vmovaps %xmm0, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -1636,9 +12022,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 +; X86-AVX-NEXT: andl $3, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 +; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp @@ -1647,15 +12033,3662 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 - %byteOff = load i256, 
ptr %byteOff.ptr, align 1 - %bitOff = shl i256 %byteOff, 3 + %qwordOff = load i256, ptr %qwordOff.ptr, align 1 + %bitOff = shl i256 %qwordOff, 6 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: lshr_64bytes: +; FALLBACK0-LABEL: lshr_64bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r15 +; FALLBACK0-NEXT: pushq %r14 +; FALLBACK0-NEXT: pushq %r13 +; FALLBACK0-NEXT: pushq %r12 +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rax +; FALLBACK0-NEXT: movq 8(%rdi), %rcx +; FALLBACK0-NEXT: movq 16(%rdi), %r8 +; FALLBACK0-NEXT: movq 24(%rdi), %r9 +; FALLBACK0-NEXT: movq 32(%rdi), %r10 +; FALLBACK0-NEXT: movq 40(%rdi), %r11 +; FALLBACK0-NEXT: movq 48(%rdi), %rbx +; FALLBACK0-NEXT: movq 56(%rdi), %r14 +; FALLBACK0-NEXT: movl (%rsi), %edi +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: leal (,%rdi,8), %eax +; FALLBACK0-NEXT: andl $56, %eax +; FALLBACK0-NEXT: andl $56, %edi +; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 +; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 +; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 +; FALLBACK0-NEXT: movl %esi, %ecx +; 
FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: orq %r11, %r9 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %r8, %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %r8 +; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 +; FALLBACK0-NEXT: movq %r10, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r15 +; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 +; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r15, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: addq %r10, %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx +; FALLBACK0-NEXT: movq %rbx, %r12 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r12 +; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 +; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: orq %r12, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r14 +; FALLBACK0-NEXT: addq %rbx, %rbx +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: orq %r14, %rbx +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r13 +; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi +; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: orq %r13, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: movq %rdi, 56(%rdx) +; FALLBACK0-NEXT: movq %r14, 48(%rdx) +; FALLBACK0-NEXT: movq %rbx, 32(%rdx) +; FALLBACK0-NEXT: movq %r15, 40(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 24(%rdx) +; FALLBACK0-NEXT: movq %r8, (%rdx) +; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; 
FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r12 +; FALLBACK0-NEXT: popq %r13 +; FALLBACK0-NEXT: popq %r14 +; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: lshr_64bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: pushq %r15 +; FALLBACK1-NEXT: pushq %r14 +; FALLBACK1-NEXT: pushq %rbx +; FALLBACK1-NEXT: movq (%rdi), %rcx +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %r10 +; FALLBACK1-NEXT: movq 32(%rdi), %r11 +; FALLBACK1-NEXT: movq 40(%rdi), %rbx +; FALLBACK1-NEXT: movq 48(%rdi), %r14 +; FALLBACK1-NEXT: movq 56(%rdi), %rdi +; FALLBACK1-NEXT: movl (%rsi), %eax +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: leal (,%rax,8), %ecx +; FALLBACK1-NEXT: andl $56, %ecx +; FALLBACK1-NEXT: andl $56, %eax +; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 +; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq %r11, %rbx +; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK1-NEXT: movq %r14, %r15 +; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 +; FALLBACK1-NEXT: 
shrdq %cl, %r14, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 +; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: shrq %cl, %rax +; FALLBACK1-NEXT: movq %r11, 48(%rdx) +; FALLBACK1-NEXT: movq %rax, 56(%rdx) +; FALLBACK1-NEXT: movq %r10, 32(%rdx) +; FALLBACK1-NEXT: movq %r15, 40(%rdx) +; FALLBACK1-NEXT: movq %rdi, 16(%rdx) +; FALLBACK1-NEXT: movq %rbx, 24(%rdx) +; FALLBACK1-NEXT: movq %rsi, (%rdx) +; FALLBACK1-NEXT: movq %r8, 8(%rdx) +; FALLBACK1-NEXT: popq %rbx +; FALLBACK1-NEXT: popq %r14 +; FALLBACK1-NEXT: popq %r15 +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: lshr_64bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: pushq %rbp +; FALLBACK2-NEXT: pushq %r15 +; FALLBACK2-NEXT: pushq %r14 +; FALLBACK2-NEXT: pushq %r13 +; FALLBACK2-NEXT: pushq %r12 +; FALLBACK2-NEXT: pushq %rbx +; FALLBACK2-NEXT: pushq %rax +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %r10 +; FALLBACK2-NEXT: movq 32(%rdi), %r11 +; FALLBACK2-NEXT: movq 40(%rdi), %rbx +; FALLBACK2-NEXT: movq 48(%rdi), %r14 +; FALLBACK2-NEXT: movq 56(%rdi), %rdi +; FALLBACK2-NEXT: movl (%rsi), %eax +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: leal (,%rax,8), %ecx +; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: andl $56, 
%eax +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx +; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 +; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp +; FALLBACK2-NEXT: movl %ecx, %r12d +; FALLBACK2-NEXT: notb %r12b +; FALLBACK2-NEXT: addq %r9, %r9 +; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: orq %rbx, %r9 +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi +; FALLBACK2-NEXT: orq %r13, %rdi +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx +; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: addq %r10, %r10 +; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 +; FALLBACK2-NEXT: orq %r8, %r10 +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi +; FALLBACK2-NEXT: orq %r11, %rsi +; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 +; FALLBACK2-NEXT: orq %r15, %r8 +; FALLBACK2-NEXT: addq %r14, %r14 +; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 +; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: addq %rax, %rax +; FALLBACK2-NEXT: shlxq %r12, %rax, %rax +; FALLBACK2-NEXT: orq %r13, %rax +; FALLBACK2-NEXT: movq %rcx, 56(%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r8, 40(%rdx) +; FALLBACK2-NEXT: movq %rsi, 16(%rdx) +; FALLBACK2-NEXT: movq %r10, 24(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: addq $8, %rsp +; FALLBACK2-NEXT: popq %rbx +; FALLBACK2-NEXT: popq %r12 +; FALLBACK2-NEXT: popq %r13 +; FALLBACK2-NEXT: popq %r14 +; FALLBACK2-NEXT: popq %r15 +; 
FALLBACK2-NEXT: popq %rbp +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: lshr_64bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: pushq %r15 +; FALLBACK3-NEXT: pushq %r14 +; FALLBACK3-NEXT: pushq %rbx +; FALLBACK3-NEXT: movq (%rdi), %rcx +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %r10 +; FALLBACK3-NEXT: movq 32(%rdi), %r11 +; FALLBACK3-NEXT: movq 40(%rdi), %rbx +; FALLBACK3-NEXT: movq 48(%rdi), %r14 +; FALLBACK3-NEXT: movq 56(%rdi), %rdi +; FALLBACK3-NEXT: movl (%rsi), %eax +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: leal (,%rax,8), %ecx +; FALLBACK3-NEXT: andl $56, %ecx +; FALLBACK3-NEXT: andl $56, %eax +; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 +; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq %r11, %rbx +; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK3-NEXT: movq %r14, %r15 +; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 +; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 +; 
FALLBACK3-NEXT: shrxq %rcx, %rax, %rax +; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK3-NEXT: movq %r11, 48(%rdx) +; FALLBACK3-NEXT: movq %r10, 32(%rdx) +; FALLBACK3-NEXT: movq %r15, 40(%rdx) +; FALLBACK3-NEXT: movq %rdi, 16(%rdx) +; FALLBACK3-NEXT: movq %rbx, 24(%rdx) +; FALLBACK3-NEXT: movq %rsi, (%rdx) +; FALLBACK3-NEXT: movq %r8, 8(%rdx) +; FALLBACK3-NEXT: movq %rax, 56(%rdx) +; FALLBACK3-NEXT: popq %rbx +; FALLBACK3-NEXT: popq %r14 +; FALLBACK3-NEXT: popq %r15 +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: lshr_64bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbp +; FALLBACK4-NEXT: pushq %r15 +; FALLBACK4-NEXT: pushq %r14 +; FALLBACK4-NEXT: pushq %r13 +; FALLBACK4-NEXT: pushq %r12 +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: pushq %rax +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK4-NEXT: movl (%rsi), %r8d +; FALLBACK4-NEXT: xorps %xmm4, %xmm4 +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: leal (,%r8,8), %eax +; FALLBACK4-NEXT: andl $56, %eax +; FALLBACK4-NEXT: andl $56, %r8d +; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 +; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rdi +; FALLBACK4-NEXT: orq %r10, %rdi +; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10 +; 
FALLBACK4-NEXT: movq %r10, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12 +; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %rbx, %r11 +; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx +; FALLBACK4-NEXT: movq %rbx, %r14 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r14 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r14, %r10 +; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14 +; FALLBACK4-NEXT: movq %r14, %r13 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp +; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r15 +; FALLBACK4-NEXT: orq %r13, %r15 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r12 +; FALLBACK4-NEXT: addq %r14, %r14 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r14 +; FALLBACK4-NEXT: orq %r12, %r14 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %rbp +; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8 +; FALLBACK4-NEXT: leaq (%r8,%r8), %r12 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r12 +; FALLBACK4-NEXT: orq %rbp, %r12 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: addq %rbx, %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r9, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; FALLBACK4-NEXT: movq %r8, 56(%rdx) +; FALLBACK4-NEXT: movq %rbx, 8(%rdx) +; FALLBACK4-NEXT: movq %r12, 48(%rdx) +; FALLBACK4-NEXT: movq %r14, 32(%rdx) +; FALLBACK4-NEXT: movq %r15, 40(%rdx) +; FALLBACK4-NEXT: movq %r10, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 24(%rdx) +; FALLBACK4-NEXT: movq %rdi, (%rdx) +; FALLBACK4-NEXT: addq $8, %rsp +; 
FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r12 +; FALLBACK4-NEXT: popq %r13 +; FALLBACK4-NEXT: popq %r14 +; FALLBACK4-NEXT: popq %r15 +; FALLBACK4-NEXT: popq %rbp +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: lshr_64bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: pushq %r15 +; FALLBACK5-NEXT: pushq %r14 +; FALLBACK5-NEXT: pushq %rbx +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK5-NEXT: movl (%rsi), %eax +; FALLBACK5-NEXT: xorps %xmm4, %xmm4 +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: leal (,%rax,8), %ecx +; FALLBACK5-NEXT: andl $56, %ecx +; FALLBACK5-NEXT: andl $56, %eax +; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq %r9, %rsi +; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK5-NEXT: movq %r10, %r8 +; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK5-NEXT: movq %r11, %rbx +; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r15 +; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: shrq %cl, %r11 +; FALLBACK5-NEXT: 
movq %r15, 8(%rdx) +; FALLBACK5-NEXT: movq %r9, 48(%rdx) +; FALLBACK5-NEXT: movq %r11, 56(%rdx) +; FALLBACK5-NEXT: movq %rdi, 32(%rdx) +; FALLBACK5-NEXT: movq %rbx, 40(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq %r14, (%rdx) +; FALLBACK5-NEXT: popq %rbx +; FALLBACK5-NEXT: popq %r14 +; FALLBACK5-NEXT: popq %r15 +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: lshr_64bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: pushq %rbp +; FALLBACK6-NEXT: pushq %r15 +; FALLBACK6-NEXT: pushq %r14 +; FALLBACK6-NEXT: pushq %r13 +; FALLBACK6-NEXT: pushq %r12 +; FALLBACK6-NEXT: pushq %rbx +; FALLBACK6-NEXT: pushq %rax +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: xorps %xmm4, %xmm4 +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: leal (,%rax,8), %esi +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: andl $56, %eax +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK6-NEXT: movl %esi, %ebx +; FALLBACK6-NEXT: notb %bl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK6-NEXT: 
shlxq %rbx, %r8, %r8 +; FALLBACK6-NEXT: orq %r11, %r8 +; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r9, %rdi +; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK6-NEXT: orq %r14, %r9 +; FALLBACK6-NEXT: addq %r10, %r10 +; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK6-NEXT: orq %r15, %r10 +; FALLBACK6-NEXT: addq %rax, %rax +; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK6-NEXT: orq %r13, %rax +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK6-NEXT: orq %rbp, %rcx +; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: movq %rcx, 8(%rdx) +; FALLBACK6-NEXT: movq %rax, 48(%rdx) +; FALLBACK6-NEXT: movq %r10, 32(%rdx) +; FALLBACK6-NEXT: movq %r9, 40(%rdx) +; FALLBACK6-NEXT: movq %rdi, 16(%rdx) +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %r8, (%rdx) +; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: popq %rbx +; FALLBACK6-NEXT: popq %r12 +; FALLBACK6-NEXT: popq %r13 +; FALLBACK6-NEXT: popq %r14 +; FALLBACK6-NEXT: popq %r15 +; FALLBACK6-NEXT: popq %rbp +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: lshr_64bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: pushq %r15 +; FALLBACK7-NEXT: pushq %r14 +; FALLBACK7-NEXT: pushq %rbx +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK7-NEXT: movl (%rsi), %eax +; FALLBACK7-NEXT: xorps %xmm4, %xmm4 +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: 
movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: leal (,%rax,8), %ecx +; FALLBACK7-NEXT: andl $56, %ecx +; FALLBACK7-NEXT: andl $56, %eax +; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq %r9, %rsi +; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK7-NEXT: movq %r10, %r8 +; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK7-NEXT: movq %r11, %rbx +; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK7-NEXT: movq %rax, %r15 +; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 +; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: movq %r15, 8(%rdx) +; FALLBACK7-NEXT: movq %r9, 48(%rdx) +; FALLBACK7-NEXT: movq %rdi, 32(%rdx) +; FALLBACK7-NEXT: movq %rbx, 40(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rsi, 24(%rdx) +; FALLBACK7-NEXT: movq %r14, (%rdx) +; FALLBACK7-NEXT: movq %r10, 56(%rdx) +; FALLBACK7-NEXT: popq %rbx +; FALLBACK7-NEXT: popq %r14 +; FALLBACK7-NEXT: popq %r15 +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: lshr_64bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbp +; FALLBACK8-NEXT: pushq %r15 +; FALLBACK8-NEXT: pushq %r14 +; FALLBACK8-NEXT: pushq %r13 +; FALLBACK8-NEXT: pushq %r12 +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: pushq %rax +; FALLBACK8-NEXT: vmovups 
(%rdi), %ymm0 +; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK8-NEXT: movl (%rsi), %r9d +; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: leal (,%r9,8), %eax +; FALLBACK8-NEXT: andl $56, %eax +; FALLBACK8-NEXT: andl $56, %r9d +; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rdi +; FALLBACK8-NEXT: orq %r10, %rdi +; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10 +; FALLBACK8-NEXT: movq %r10, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %rbx, %r11 +; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK8-NEXT: movq %rbx, %r14 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r14 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r14, %r10 +; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK8-NEXT: movq %r14, %r13 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r15 +; FALLBACK8-NEXT: orq %r13, %r15 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r12 +; FALLBACK8-NEXT: addq %r14, %r14 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r14 +; FALLBACK8-NEXT: orq %r12, %r14 +; 
FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %rbp +; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9 +; FALLBACK8-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r12 +; FALLBACK8-NEXT: orq %rbp, %r12 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: addq %rbx, %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r8, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r9 +; FALLBACK8-NEXT: movq %r9, 56(%rdx) +; FALLBACK8-NEXT: movq %rbx, 8(%rdx) +; FALLBACK8-NEXT: movq %r12, 48(%rdx) +; FALLBACK8-NEXT: movq %r14, 32(%rdx) +; FALLBACK8-NEXT: movq %r15, 40(%rdx) +; FALLBACK8-NEXT: movq %r10, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 24(%rdx) +; FALLBACK8-NEXT: movq %rdi, (%rdx) +; FALLBACK8-NEXT: addq $8, %rsp +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r12 +; FALLBACK8-NEXT: popq %r13 +; FALLBACK8-NEXT: popq %r14 +; FALLBACK8-NEXT: popq %r15 +; FALLBACK8-NEXT: popq %rbp +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: lshr_64bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: pushq %r15 +; FALLBACK9-NEXT: pushq %r14 +; FALLBACK9-NEXT: pushq %rbx +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: andl $56, %ecx +; FALLBACK9-NEXT: andl $56, %eax +; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq %r9, %rsi +; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: movq %r10, %r8 +; FALLBACK9-NEXT: shrdq 
%cl, %r9, %r8 +; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq %r11, %rbx +; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK9-NEXT: shrq %cl, %r11 +; FALLBACK9-NEXT: movq %r15, 8(%rdx) +; FALLBACK9-NEXT: movq %r9, 48(%rdx) +; FALLBACK9-NEXT: movq %r11, 56(%rdx) +; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rbx, 40(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r14, (%rdx) +; FALLBACK9-NEXT: popq %rbx +; FALLBACK9-NEXT: popq %r14 +; FALLBACK9-NEXT: popq %r15 +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: lshr_64bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: pushq %rbp +; FALLBACK10-NEXT: pushq %r15 +; FALLBACK10-NEXT: pushq %r14 +; FALLBACK10-NEXT: pushq %r13 +; FALLBACK10-NEXT: pushq %r12 +; FALLBACK10-NEXT: pushq %rbx +; FALLBACK10-NEXT: pushq %rax +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: leal (,%rax,8), %esi +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: andl $56, %eax +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK10-NEXT: shrxq 
%rsi, %rdi, %r12 +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK10-NEXT: movl %esi, %ebx +; FALLBACK10-NEXT: notb %bl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK10-NEXT: orq %r11, %r8 +; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r9, %rdi +; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK10-NEXT: orq %r14, %r9 +; FALLBACK10-NEXT: addq %r10, %r10 +; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK10-NEXT: orq %r15, %r10 +; FALLBACK10-NEXT: addq %rax, %rax +; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK10-NEXT: orq %r13, %rax +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK10-NEXT: orq %rbp, %rcx +; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %rax, 48(%rdx) +; FALLBACK10-NEXT: movq %r10, 32(%rdx) +; FALLBACK10-NEXT: movq %r9, 40(%rdx) +; FALLBACK10-NEXT: movq %rdi, 16(%rdx) +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %r8, (%rdx) +; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: popq %rbx +; FALLBACK10-NEXT: popq %r12 +; FALLBACK10-NEXT: popq %r13 +; FALLBACK10-NEXT: popq %r14 +; FALLBACK10-NEXT: popq %r15 +; FALLBACK10-NEXT: popq %rbp +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: 
lshr_64bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: pushq %r15 +; FALLBACK11-NEXT: pushq %r14 +; FALLBACK11-NEXT: pushq %rbx +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK11-NEXT: movl (%rsi), %eax +; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: leal (,%rax,8), %ecx +; FALLBACK11-NEXT: andl $56, %ecx +; FALLBACK11-NEXT: andl $56, %eax +; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq %r9, %rsi +; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK11-NEXT: movq %r10, %r8 +; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK11-NEXT: movq %r11, %rbx +; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK11-NEXT: movq %rax, %r15 +; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 +; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: movq %r15, 8(%rdx) +; FALLBACK11-NEXT: movq %r9, 48(%rdx) +; FALLBACK11-NEXT: movq %rdi, 32(%rdx) +; FALLBACK11-NEXT: movq %rbx, 40(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rsi, 24(%rdx) +; FALLBACK11-NEXT: movq %r14, (%rdx) +; FALLBACK11-NEXT: movq %r10, 56(%rdx) +; FALLBACK11-NEXT: popq %rbx +; FALLBACK11-NEXT: popq %r14 +; FALLBACK11-NEXT: popq %r15 +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; 
FALLBACK12-LABEL: lshr_64bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbp +; FALLBACK12-NEXT: pushq %r15 +; FALLBACK12-NEXT: pushq %r14 +; FALLBACK12-NEXT: pushq %r13 +; FALLBACK12-NEXT: pushq %r12 +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: pushq %rax +; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK12-NEXT: movl (%rsi), %r9d +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: andl $56, %eax +; FALLBACK12-NEXT: andl $56, %r9d +; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 +; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rdi +; FALLBACK12-NEXT: orq %r10, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 +; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %rbx, %r11 +; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq %rbx, %r14 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r14 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r14, %r10 +; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: movq %r14, %r13 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r15 +; FALLBACK12-NEXT: orq %r13, %r15 +; FALLBACK12-NEXT: 
movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r12 +; FALLBACK12-NEXT: addq %r14, %r14 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r14 +; FALLBACK12-NEXT: orq %r12, %r14 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %rbp +; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 +; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r12 +; FALLBACK12-NEXT: orq %rbp, %r12 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: addq %rbx, %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq %r8, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: movq %rbx, 8(%rdx) +; FALLBACK12-NEXT: movq %r12, 48(%rdx) +; FALLBACK12-NEXT: movq %r14, 32(%rdx) +; FALLBACK12-NEXT: movq %r15, 40(%rdx) +; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 24(%rdx) +; FALLBACK12-NEXT: movq %rdi, (%rdx) +; FALLBACK12-NEXT: addq $8, %rsp +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r12 +; FALLBACK12-NEXT: popq %r13 +; FALLBACK12-NEXT: popq %r14 +; FALLBACK12-NEXT: popq %r15 +; FALLBACK12-NEXT: popq %rbp +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: lshr_64bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: pushq %r15 +; FALLBACK13-NEXT: pushq %r14 +; FALLBACK13-NEXT: pushq %rbx +; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK13-NEXT: movl (%rsi), %edi +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: leal (,%rdi,8), %ecx +; FALLBACK13-NEXT: andl $56, %ecx +; FALLBACK13-NEXT: andl $56, %edi +; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK13-NEXT: movq %r9, %rax +; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax +; 
FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10 +; FALLBACK13-NEXT: movq %r10, %r8 +; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11 +; FALLBACK13-NEXT: movq %r11, %rbx +; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11 +; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK13-NEXT: movq %rdi, %r15 +; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: shrq %cl, %r11 +; FALLBACK13-NEXT: movq %r15, 8(%rdx) +; FALLBACK13-NEXT: movq %r9, 48(%rdx) +; FALLBACK13-NEXT: movq %r11, 56(%rdx) +; FALLBACK13-NEXT: movq %rsi, 32(%rdx) +; FALLBACK13-NEXT: movq %rbx, 40(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rax, 24(%rdx) +; FALLBACK13-NEXT: movq %r14, (%rdx) +; FALLBACK13-NEXT: popq %rbx +; FALLBACK13-NEXT: popq %r14 +; FALLBACK13-NEXT: popq %r15 +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: lshr_64bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: pushq %rbp +; FALLBACK14-NEXT: pushq %r15 +; FALLBACK14-NEXT: pushq %r14 +; FALLBACK14-NEXT: pushq %r13 +; FALLBACK14-NEXT: pushq %r12 +; FALLBACK14-NEXT: pushq %rbx +; FALLBACK14-NEXT: pushq %rax +; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK14-NEXT: movl (%rsi), %esi +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: leal (,%rsi,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 
+; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 +; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 +; FALLBACK14-NEXT: movl %ecx, %ebx +; FALLBACK14-NEXT: notb %bl +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp +; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK14-NEXT: orq %r11, %r8 +; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r9, %rdi +; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK14-NEXT: orq %r14, %r9 +; FALLBACK14-NEXT: addq %r10, %r10 +; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK14-NEXT: orq %r15, %r10 +; FALLBACK14-NEXT: addq %rsi, %rsi +; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi +; FALLBACK14-NEXT: orq %r13, %rsi +; FALLBACK14-NEXT: addq %rax, %rax +; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK14-NEXT: orq %rbp, %rax +; FALLBACK14-NEXT: movq %rcx, 56(%rdx) +; FALLBACK14-NEXT: movq %rax, 8(%rdx) +; FALLBACK14-NEXT: movq %rsi, 48(%rdx) +; FALLBACK14-NEXT: movq %r10, 32(%rdx) +; FALLBACK14-NEXT: movq %r9, 40(%rdx) +; FALLBACK14-NEXT: movq %rdi, 16(%rdx) +; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %r8, (%rdx) +; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: popq %rbx +; FALLBACK14-NEXT: popq %r12 +; FALLBACK14-NEXT: popq %r13 +; FALLBACK14-NEXT: popq %r14 +; FALLBACK14-NEXT: popq %r15 +; FALLBACK14-NEXT: popq %rbp +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: lshr_64bytes: +; 
FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: pushq %r15 +; FALLBACK15-NEXT: pushq %r14 +; FALLBACK15-NEXT: pushq %rbx +; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: andl $56, %ecx +; FALLBACK15-NEXT: andl $56, %eax +; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq %r9, %rsi +; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: movq %r10, %r8 +; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq %r11, %rbx +; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 +; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: movq %r15, 8(%rdx) +; FALLBACK15-NEXT: movq %r9, 48(%rdx) +; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rbx, 40(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %r14, (%rdx) +; FALLBACK15-NEXT: movq %r10, 56(%rdx) +; FALLBACK15-NEXT: popq %rbx +; FALLBACK15-NEXT: popq %r14 +; FALLBACK15-NEXT: popq %r15 +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: lshr_64bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: 
pushl %esi +; FALLBACK16-NEXT: subl $204, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl (%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 16(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 20(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 24(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 28(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 32(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 40(%eax), %ebp +; FALLBACK16-NEXT: movl 44(%eax), %ebx +; FALLBACK16-NEXT: movl 48(%eax), %edi +; FALLBACK16-NEXT: movl 52(%eax), %esi +; FALLBACK16-NEXT: movl 56(%eax), %edx +; FALLBACK16-NEXT: movl 60(%eax), %ecx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl (%eax), %eax +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, 
{{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %eax, %esi +; FALLBACK16-NEXT: andl $60, %esi +; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK16-NEXT: shll $3, %eax +; FALLBACK16-NEXT: andl $24, %eax +; FALLBACK16-NEXT: movl %edx, %edi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK16-NEXT: movb %al, %ch +; FALLBACK16-NEXT: notb %ch +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 64(%esp,%esi), 
%edi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %edi, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK16-NEXT: movl %edx, %ebp +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %ebp, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK16-NEXT: movl %ebx, %ebp +; FALLBACK16-NEXT: movl %eax, %edx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %eax, %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %ebp, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK16-NEXT: movl %ebx, %ebp +; 
FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %ebp, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %eax, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK16-NEXT: movl %ebx, %ebp +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK16-NEXT: leal (%edx,%edx), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %ebp, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK16-NEXT: movl %edi, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl 
%ebp, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %edx, %edi +; FALLBACK16-NEXT: movl %esi, %edx +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK16-NEXT: movl %esi, %ebx +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK16-NEXT: leal (%eax,%eax), %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %ebx, %esi +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %eax, %edx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %ebx, 60(%eax) +; FALLBACK16-NEXT: movl %edx, 56(%eax) +; FALLBACK16-NEXT: movl %esi, 48(%eax) +; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl %edi, 40(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 44(%eax) +; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 32(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 36(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 24(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 28(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, (%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: lshr_64bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 12(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK17-NEXT: movl 16(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 20(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 24(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 28(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 36(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%ecx), %ebp +; FALLBACK17-NEXT: movl 44(%ecx), %ebx +; FALLBACK17-NEXT: movl 48(%ecx), %edi +; FALLBACK17-NEXT: movl 52(%ecx), %esi +; FALLBACK17-NEXT: movl 56(%ecx), %edx +; FALLBACK17-NEXT: movl 60(%ecx), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %ecx +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ecx, %ebp +; FALLBACK17-NEXT: andl $60, %ebp +; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shll $3, %ecx +; FALLBACK17-NEXT: andl $24, %ecx +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %edi, %edx +; 
FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %esi, %edx +; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl %edx, 56(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK17-NEXT: shrl %cl, %eax +; FALLBACK17-NEXT: movl %eax, 60(%ebp) +; FALLBACK17-NEXT: movl %esi, 48(%ebp) +; FALLBACK17-NEXT: movl %edi, 52(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 44(%ebp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 32(%ebp) +; 
FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 36(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 24(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 16(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 20(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: addl $188, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: lshr_64bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $204, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 12(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 16(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 20(%eax), %ecx +; FALLBACK18-NEXT: movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 24(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 28(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 36(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%eax), %ebp +; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 48(%eax), %edi +; FALLBACK18-NEXT: movl 52(%eax), %esi +; FALLBACK18-NEXT: movl 56(%eax), %edx +; FALLBACK18-NEXT: movl 60(%eax), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: andl $60, %ecx +; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %edi +; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: leal (%eax,%eax), %ebp +; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: orl %edi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %edi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), 
%esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl %ecx, %edi +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK18-NEXT: shrxl %edx, %eax, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %eax, %eax +; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx +; FALLBACK18-NEXT: addl %ebp, %ebp +; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl %edx, 60(%eax) +; FALLBACK18-NEXT: movl %ebx, 56(%eax) +; FALLBACK18-NEXT: movl %edi, 48(%eax) +; FALLBACK18-NEXT: movl %ecx, 52(%eax) +; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 44(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 32(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK18-NEXT: movl %ecx, 36(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 24(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 28(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 16(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $204, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: lshr_64bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $188, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 12(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 16(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 20(%ecx), %eax +; FALLBACK19-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 24(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 28(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 36(%ecx), %eax +; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%ecx), %ebp +; FALLBACK19-NEXT: movl 44(%ecx), %ebx +; FALLBACK19-NEXT: movl 48(%ecx), %edi +; FALLBACK19-NEXT: movl 52(%ecx), %esi +; FALLBACK19-NEXT: movl 56(%ecx), %edx +; FALLBACK19-NEXT: movl 60(%ecx), %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %ecx +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ecx, %ebp +; FALLBACK19-NEXT: andl $60, %ebp +; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shll $3, %ecx +; FALLBACK19-NEXT: andl $24, %ecx +; FALLBACK19-NEXT: shrdl %cl, %edx, %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %esi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %edi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edi +; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %esi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl %edi, %edx +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl %eax, 56(%ebp) +; FALLBACK19-NEXT: movl %esi, 48(%ebp) +; FALLBACK19-NEXT: movl %edx, 52(%ebp) +; FALLBACK19-NEXT: movl %ebx, 40(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 44(%ebp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 32(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 36(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 24(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 28(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 16(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 
20(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 8(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %edi, (%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 4(%ebp) +; FALLBACK19-NEXT: movl %eax, 60(%ebp) +; FALLBACK19-NEXT: addl $188, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: lshr_64bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK20-NEXT: movl (%eax), %eax +; FALLBACK20-NEXT: xorps %xmm4, %xmm4 +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %eax, %esi +; FALLBACK20-NEXT: andl $60, %esi +; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK20-NEXT: shll $3, %eax +; 
FALLBACK20-NEXT: andl $24, %eax +; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movb %al, %ch +; FALLBACK20-NEXT: notb %ch +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %edi, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %ebp +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax +; FALLBACK20-NEXT: movb %ch, %cl 
+; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %eax, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK20-NEXT: leal (%edx,%edx), %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; 
FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK20-NEXT: movl %edi, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %edx, %edi +; FALLBACK20-NEXT: movl %esi, %edx +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK20-NEXT: movl %esi, %ebx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl 
%eax, %edx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl %ebx, 60(%eax) +; FALLBACK20-NEXT: movl %edx, 56(%eax) +; FALLBACK20-NEXT: movl %esi, 48(%eax) +; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; FALLBACK20-NEXT: movl %edi, 40(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 44(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 32(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 36(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 24(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 28(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, (%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 4(%eax) +; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: lshr_64bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; 
FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $188, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK21-NEXT: movl (%eax), %ecx +; FALLBACK21-NEXT: xorps %xmm4, %xmm4 +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %ecx, %ebp +; FALLBACK21-NEXT: andl $60, %ebp +; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shll $3, %ecx +; FALLBACK21-NEXT: andl $24, %ecx +; FALLBACK21-NEXT: shrdl %cl, %edx, %eax +; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK21-NEXT: 
movl 76(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %esi, %edx +; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edi +; FALLBACK21-NEXT: shrdl %cl, %esi, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edi +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %edx, 56(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK21-NEXT: shrl %cl, %eax +; FALLBACK21-NEXT: movl %eax, 60(%ebp) +; FALLBACK21-NEXT: movl %esi, 48(%ebp) +; FALLBACK21-NEXT: movl %edi, 52(%ebp) +; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 40(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 44(%ebp) +; FALLBACK21-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 32(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 36(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 24(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 12(%ebp) +; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; FALLBACK21-NEXT: addl $188, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: lshr_64bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $204, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: xorps %xmm4, %xmm4 +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: 
movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: andl $60, %ecx +; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %edi +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: leal (%eax,%eax), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %eax, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %eax, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ecx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx +; FALLBACK22-NEXT: addl %ebp, %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl %edx, 60(%eax) +; FALLBACK22-NEXT: movl %ebx, 56(%eax) +; FALLBACK22-NEXT: movl %edi, 48(%eax) +; FALLBACK22-NEXT: movl %ecx, 52(%eax) +; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 44(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 32(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 36(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 24(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 28(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 16(%eax) +; FALLBACK22-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 20(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 8(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 12(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, (%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: addl $204, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: lshr_64bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $188, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK23-NEXT: movl (%eax), %ecx +; FALLBACK23-NEXT: xorps %xmm4, %xmm4 +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %ecx, %ebp +; FALLBACK23-NEXT: andl $60, %ebp +; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shll $3, %ecx +; FALLBACK23-NEXT: andl $24, %ecx +; 
FALLBACK23-NEXT: shrdl %cl, %edx, %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %esi +; FALLBACK23-NEXT: shrdl %cl, %edi, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %edi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl %edi, %edx +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, 
%edi, %esi +; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK23-NEXT: movl %eax, 56(%ebp) +; FALLBACK23-NEXT: movl %esi, 48(%ebp) +; FALLBACK23-NEXT: movl %edx, 52(%ebp) +; FALLBACK23-NEXT: movl %ebx, 40(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 44(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 32(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 36(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 24(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 28(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 16(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 20(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 8(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 12(%ebp) +; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %edi, (%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 4(%ebp) +; FALLBACK23-NEXT: movl %eax, 60(%ebp) +; FALLBACK23-NEXT: addl $188, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp 
+; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: lshr_64bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK24-NEXT: movl (%eax), %ecx +; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, %esi +; FALLBACK24-NEXT: andl $60, %esi +; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK24-NEXT: shll $3, %ecx +; FALLBACK24-NEXT: andl $24, %ecx +; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%eax,%eax), %ebx +; FALLBACK24-NEXT: movl %ecx, %ebp +; FALLBACK24-NEXT: movb %cl, %ch +; FALLBACK24-NEXT: notb %ch +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %edi, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %ebp +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK24-NEXT: leal 
(%edi,%edi), %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
+; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %eax, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK24-NEXT: leal (%edx,%edx), %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK24-NEXT: movl %edi, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %edx, %edi +; FALLBACK24-NEXT: movl %esi, %edx +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK24-NEXT: movl %esi, %ebx +; FALLBACK24-NEXT: movb %al, 
%cl +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %eax, %edx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %ebx, 60(%eax) +; FALLBACK24-NEXT: movl %edx, 56(%eax) +; FALLBACK24-NEXT: movl %esi, 48(%eax) +; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl %edi, 40(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 44(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 32(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 36(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 24(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 28(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 
16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, (%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 4(%eax) +; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: vzeroupper +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: lshr_64bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $188, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK25-NEXT: movl (%eax), %ecx +; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %ecx, %ebp +; FALLBACK25-NEXT: andl $60, %ebp +; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shll $3, %ecx +; FALLBACK25-NEXT: andl $24, %ecx +; FALLBACK25-NEXT: shrdl %cl, %edx, %eax +; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: 
movl %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %esi, %edx +; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edi +; FALLBACK25-NEXT: shrdl %cl, %esi, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edi +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK25-NEXT: shrdl 
%cl, %eax, %edx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %edx, 56(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK25-NEXT: shrl %cl, %eax +; FALLBACK25-NEXT: movl %eax, 60(%ebp) +; FALLBACK25-NEXT: movl %esi, 48(%ebp) +; FALLBACK25-NEXT: movl %edi, 52(%ebp) +; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 40(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 44(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 32(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 36(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 24(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: addl $188, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: vzeroupper +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: lshr_64bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; 
FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $204, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK26-NEXT: movl (%eax), %ecx +; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: leal (,%ecx,8), %edx +; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: andl $60, %ecx +; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %edi +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %eax, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %eax, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal 
(%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx +; FALLBACK26-NEXT: addl %ecx, %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx +; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %edx, 60(%ecx) +; FALLBACK26-NEXT: movl %ebx, 56(%ecx) +; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %esi, 52(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 44(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 32(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 36(%ecx) 
+; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 24(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 16(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 20(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 8(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 12(%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, (%ecx) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl %eax, 4(%ecx) +; FALLBACK26-NEXT: addl $204, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: vzeroupper +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: lshr_64bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $188, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK27-NEXT: movl (%eax), %ecx +; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %ecx, %ebp +; FALLBACK27-NEXT: andl $60, %ebp +; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shll $3, %ecx +; FALLBACK27-NEXT: andl $24, %ecx +; FALLBACK27-NEXT: shrdl %cl, %edx, %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %esi +; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %edi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl 100(%esp,%ebp), 
%edi +; FALLBACK27-NEXT: movl %edi, %edx +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK27-NEXT: movl %eax, 56(%ebp) +; FALLBACK27-NEXT: movl %esi, 48(%ebp) +; FALLBACK27-NEXT: movl %edx, 52(%ebp) +; FALLBACK27-NEXT: movl %ebx, 40(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 44(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 32(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 36(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 24(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 28(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 16(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 20(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 8(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 12(%ebp) +; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %edi, (%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 4(%ebp) +; FALLBACK27-NEXT: movl %eax, 60(%ebp) +; FALLBACK27-NEXT: addl $188, %esp +; 
FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: vzeroupper +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: lshr_64bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK28-NEXT: movl (%eax), %ecx +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, %esi +; FALLBACK28-NEXT: andl $60, %esi +; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK28-NEXT: shll $3, %ecx +; FALLBACK28-NEXT: andl $24, %ecx +; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%eax,%eax), %ebx +; FALLBACK28-NEXT: movl %ecx, %ebp +; FALLBACK28-NEXT: movb %cl, %ch +; FALLBACK28-NEXT: notb %ch +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK28-NEXT: movl %ebp, %eax +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %edi, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %ebp +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK28-NEXT: leal 
(%edi,%edi), %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
+; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %eax, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK28-NEXT: leal (%edx,%edx), %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK28-NEXT: movl %edi, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %edx, %edi +; FALLBACK28-NEXT: movl %esi, %edx +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK28-NEXT: movl %esi, %ebx +; FALLBACK28-NEXT: movb %al, 
%cl +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %eax, %edx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %ebx, 60(%eax) +; FALLBACK28-NEXT: movl %edx, 56(%eax) +; FALLBACK28-NEXT: movl %esi, 48(%eax) +; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl %edi, 40(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 44(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 32(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 36(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 24(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 28(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 
16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, (%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 4(%eax) +; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: vzeroupper +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: lshr_64bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $188, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK29-NEXT: movl (%eax), %ecx +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %ecx, %ebp +; FALLBACK29-NEXT: andl $60, %ebp +; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shll $3, %ecx +; FALLBACK29-NEXT: andl $24, %ecx +; FALLBACK29-NEXT: shrdl %cl, %edx, %eax +; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: 
shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %esi, %edx +; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edi +; FALLBACK29-NEXT: shrdl %cl, %esi, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edi +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %edx, 56(%ebp) +; FALLBACK29-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK29-NEXT: shrl %cl, %eax +; FALLBACK29-NEXT: movl %eax, 60(%ebp) +; FALLBACK29-NEXT: movl %esi, 48(%ebp) +; FALLBACK29-NEXT: movl %edi, 52(%ebp) +; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 40(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 44(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 32(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 36(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 24(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: addl $188, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: vzeroupper +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: lshr_64bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $204, %esp +; 
FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: leal (,%edx,8), %ecx +; FALLBACK30-NEXT: andl $24, %ecx +; FALLBACK30-NEXT: andl $60, %edx +; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK30-NEXT: movl %ecx, %ebx +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %eax, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi +; FALLBACK30-NEXT: 
shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %eax, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi +; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: 
shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax +; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax +; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx +; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp +; FALLBACK30-NEXT: leal (%edx,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx +; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edx, 56(%ecx) +; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %esi, 52(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 44(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 32(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 36(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 24(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 16(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 20(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 12(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, (%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 4(%ecx) +; FALLBACK30-NEXT: addl $204, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: vzeroupper +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: lshr_64bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $188, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK31-NEXT: movl (%eax), %ecx +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %ecx, %ebp +; FALLBACK31-NEXT: andl $60, %ebp +; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shll $3, %ecx +; FALLBACK31-NEXT: andl $24, %ecx +; FALLBACK31-NEXT: shrdl %cl, %edx, %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %esi +; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; 
FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %edi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl %edi, %edx +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK31-NEXT: movl %eax, 
56(%ebp) +; FALLBACK31-NEXT: movl %esi, 48(%ebp) +; FALLBACK31-NEXT: movl %edx, 52(%ebp) +; FALLBACK31-NEXT: movl %ebx, 40(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 44(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 32(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 36(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 24(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 28(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 16(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 20(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 8(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 12(%ebp) +; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %edi, (%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 4(%ebp) +; FALLBACK31-NEXT: movl %eax, 60(%ebp) +; FALLBACK31-NEXT: addl $188, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: vzeroupper +; FALLBACK31-NEXT: retl + %src = load i512, ptr %src.ptr, align 1 + %byteOff = load i512, ptr %byteOff.ptr, align 1 + %bitOff = shl i512 %byteOff, 3 + %res = lshr i512 %src, %bitOff + store i512 %res, ptr %dst, align 1 + ret void +} + +define 
void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: lshr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), %rax @@ -1667,6 +15700,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: movq 56(%rdi), %rdi ; X64-SSE2-NEXT: movl (%rsi), %esi +; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -1675,23 +15713,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $63, %esi -; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax -; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: andl $7, %esi +; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax +; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx +; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi +; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), 
%r8 +; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 +; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 +; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 +; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) @@ -1703,35 +15733,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: lshr_64bytes: +; X64-SSE42-LABEL: lshr_64bytes_qwordOff: ; X64-SSE42: # %bb.0: +; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2 ; X64-SSE42-NEXT: movups 48(%rdi), %xmm3 ; X64-SSE42-NEXT: movl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm4, %xmm4 -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andl $63, %eax -; X64-SSE42-NEXT: movups -128(%rsp,%rax), %xmm0 -; X64-SSE42-NEXT: movups -112(%rsp,%rax), %xmm1 -; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2 -; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3 +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: andl $7, %eax +; X64-SSE42-NEXT: movups -128(%rsp,%rax,8), %xmm0 +; X64-SSE42-NEXT: movups -112(%rsp,%rax,8), %xmm1 +; 
X64-SSE42-NEXT: movups -96(%rsp,%rax,8), %xmm2 +; X64-SSE42-NEXT: movups -80(%rsp,%rax,8), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) +; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; -; X64-AVX1-LABEL: lshr_64bytes: +; X64-AVX1-LABEL: lshr_64bytes_qwordOff: ; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rax ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; X64-AVX1-NEXT: movl (%rsi), %eax @@ -1740,44 +15773,47 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: andl $63, %eax -; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3 +; X64-AVX1-NEXT: andl $7, %eax +; X64-AVX1-NEXT: vmovups -128(%rsp,%rax,8), %xmm0 +; X64-AVX1-NEXT: vmovups -112(%rsp,%rax,8), %xmm1 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rax,8), %xmm2 +; X64-AVX1-NEXT: vmovups -80(%rsp,%rax,8), %xmm3 ; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: popq %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; -; X64-AVX512-LABEL: lshr_64bytes: +; X64-AVX512-LABEL: lshr_64bytes_qwordOff: ; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: pushq %rax ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0 ; X64-AVX512-NEXT: movl (%rsi), %eax ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: andl $63, %eax -; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups 
-112(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3 +; X64-AVX512-NEXT: andl $7, %eax +; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0 +; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1 +; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2 +; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3 ; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: popq %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; -; X86-SSE2-LABEL: lshr_64bytes: +; X86-SSE2-LABEL: lshr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $168, %esp +; X86-SSE2-NEXT: subl $188, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -1798,7 +15834,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 32(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 40(%eax), %ebp ; X86-SSE2-NEXT: movl 44(%eax), %ebx ; X86-SSE2-NEXT: movl 48(%eax), %edi @@ -1807,13 +15843,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 60(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %eax +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, 
{{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -1821,6 +15861,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1833,49 +15874,33 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $63, %eax -; X86-SSE2-NEXT: movl 
40(%esp,%eax), %ecx +; X86-SSE2-NEXT: andl $7, %eax +; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp +; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx +; X86-SSE2-NEXT: movl 
100(%esp,%eax,8), %edi +; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi +; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx +; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 56(%eax) ; X86-SSE2-NEXT: movl %edx, 60(%eax) @@ -1883,7 +15908,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) @@ -1903,16 +15928,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $168, %esp +; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: lshr_64bytes: +; X86-SSE42-LABEL: lshr_64bytes_qwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $128, %esp +; X86-SSE42-NEXT: subl $140, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1922,29 +15947,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movups 48(%edx), %xmm3 ; X86-SSE42-NEXT: movl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm4, %xmm4 -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: 
movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, (%esp) -; X86-SSE42-NEXT: andl $63, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 -; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1 -; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2 -; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3 +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) +; X86-SSE42-NEXT: andl $7, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2 +; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax) ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $128, %esp +; X86-SSE42-NEXT: addl $140, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX1-LABEL: lshr_64bytes: +; X86-AVX1-LABEL: lshr_64bytes_qwordOff: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: subl $128, %esp +; X86-AVX1-NEXT: subl $140, %esp ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1956,22 +15981,22 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, (%esp) -; X86-AVX1-NEXT: andl $63, %ecx -; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3 +; X86-AVX1-NEXT: andl 
$7, %ecx +; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0 +; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 +; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 +; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX1-NEXT: vmovups %xmm0, (%eax) -; X86-AVX1-NEXT: addl $128, %esp +; X86-AVX1-NEXT: addl $140, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; -; X86-AVX512-LABEL: lshr_64bytes: +; X86-AVX512-LABEL: lshr_64bytes_qwordOff: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: subl $128, %esp +; X86-AVX512-NEXT: subl $140, %esp ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1980,27 +16005,3801 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: vmovups %zmm0, (%esp) -; X86-AVX512-NEXT: andl $63, %ecx -; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3 +; X86-AVX512-NEXT: andl $7, %ecx +; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0 +; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 +; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 +; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX512-NEXT: vmovups %xmm0, (%eax) -; X86-AVX512-NEXT: addl $128, %esp +; X86-AVX512-NEXT: addl $140, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 - %byteOff = load i512, ptr %byteOff.ptr, align 1 - %bitOff = shl i512 %byteOff, 3 + %qwordOff = load i512, ptr %qwordOff.ptr, align 1 + %bitOff = shl 
i512 %qwordOff, 6 %res = lshr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } + define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: shl_64bytes: +; FALLBACK0-LABEL: shl_64bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r15 +; FALLBACK0-NEXT: pushq %r14 +; FALLBACK0-NEXT: pushq %r13 +; FALLBACK0-NEXT: pushq %r12 +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rax +; FALLBACK0-NEXT: movq 8(%rdi), %rcx +; FALLBACK0-NEXT: movq 16(%rdi), %r8 +; FALLBACK0-NEXT: movq 24(%rdi), %r9 +; FALLBACK0-NEXT: movq 32(%rdi), %r10 +; FALLBACK0-NEXT: movq 40(%rdi), %r11 +; FALLBACK0-NEXT: movq 48(%rdi), %rbx +; FALLBACK0-NEXT: movq 56(%rdi), %rdi +; FALLBACK0-NEXT: movl (%rsi), %esi +; FALLBACK0-NEXT: xorps %xmm0, %xmm0 +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: leal (,%rsi,8), %eax +; FALLBACK0-NEXT: andl $56, %eax +; FALLBACK0-NEXT: andl $56, %esi +; FALLBACK0-NEXT: negl %esi +; FALLBACK0-NEXT: movslq %esi, %rbx +; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8 +; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi +; FALLBACK0-NEXT: movq %rdi, %r10 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq %r8, %r9 +; FALLBACK0-NEXT: shrq %r9 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r9 +; FALLBACK0-NEXT: orq %r10, %r9 +; FALLBACK0-NEXT: movq 
-40(%rsp,%rbx), %r10 +; FALLBACK0-NEXT: movq %r10, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15 +; FALLBACK0-NEXT: movq %r15, %r11 +; FALLBACK0-NEXT: shrq %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: orq %r14, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: shrq %rdi +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %rdi +; FALLBACK0-NEXT: orq %r15, %rdi +; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14 +; FALLBACK0-NEXT: movq %r14, %r12 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r12 +; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13 +; FALLBACK0-NEXT: movq %r13, %r15 +; FALLBACK0-NEXT: shrq %r15 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r15 +; FALLBACK0-NEXT: orq %r12, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r13 +; FALLBACK0-NEXT: shrq %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: orq %r13, %r10 +; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r12 +; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx +; FALLBACK0-NEXT: movq %rbx, %r13 +; FALLBACK0-NEXT: shrq %r13 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r13 +; FALLBACK0-NEXT: orq %r12, %r13 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: shrq %r14 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shrq %cl, %r14 +; FALLBACK0-NEXT: orq %rbx, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: movq %r8, (%rdx) +; FALLBACK0-NEXT: movq %r14, 48(%rdx) +; FALLBACK0-NEXT: movq %r13, 56(%rdx) +; FALLBACK0-NEXT: movq %r10, 32(%rdx) +; FALLBACK0-NEXT: movq %r15, 40(%rdx) +; FALLBACK0-NEXT: movq %rdi, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 24(%rdx) +; FALLBACK0-NEXT: movq %r9, 
8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r12 +; FALLBACK0-NEXT: popq %r13 +; FALLBACK0-NEXT: popq %r14 +; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: shl_64bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: pushq %r14 +; FALLBACK1-NEXT: pushq %rbx +; FALLBACK1-NEXT: pushq %rax +; FALLBACK1-NEXT: movq (%rdi), %rax +; FALLBACK1-NEXT: movq 8(%rdi), %rcx +; FALLBACK1-NEXT: movq 16(%rdi), %r8 +; FALLBACK1-NEXT: movq 24(%rdi), %r9 +; FALLBACK1-NEXT: movq 32(%rdi), %r10 +; FALLBACK1-NEXT: movq 40(%rdi), %r11 +; FALLBACK1-NEXT: movq 48(%rdi), %rbx +; FALLBACK1-NEXT: movq 56(%rdi), %rdi +; FALLBACK1-NEXT: movl (%rsi), %esi +; FALLBACK1-NEXT: xorps %xmm0, %xmm0 +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: leal (,%rsi,8), %ecx +; FALLBACK1-NEXT: andl $56, %ecx +; FALLBACK1-NEXT: andl $56, %esi +; FALLBACK1-NEXT: negl %esi +; FALLBACK1-NEXT: movslq %esi, %r9 +; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax +; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10 +; FALLBACK1-NEXT: movq %r10, %rsi +; FALLBACK1-NEXT: shldq %cl, %rax, %rsi +; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8 +; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi +; FALLBACK1-NEXT: shldq %cl, %rdi, %rax +; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11 +; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx +; FALLBACK1-NEXT: movq %rbx, %r14 +; FALLBACK1-NEXT: shldq %cl, %r11, %r14 +; FALLBACK1-NEXT: shldq %cl, %r10, %r11 +; FALLBACK1-NEXT: movq 
-16(%rsp,%r9), %r10 +; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK1-NEXT: shldq %cl, %r10, %r9 +; FALLBACK1-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK1-NEXT: shldq %cl, %r8, %rdi +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: shlq %cl, %r8 +; FALLBACK1-NEXT: movq %r10, 48(%rdx) +; FALLBACK1-NEXT: movq %r9, 56(%rdx) +; FALLBACK1-NEXT: movq %r11, 32(%rdx) +; FALLBACK1-NEXT: movq %r14, 40(%rdx) +; FALLBACK1-NEXT: movq %rax, 16(%rdx) +; FALLBACK1-NEXT: movq %rsi, 24(%rdx) +; FALLBACK1-NEXT: movq %r8, (%rdx) +; FALLBACK1-NEXT: movq %rdi, 8(%rdx) +; FALLBACK1-NEXT: addq $8, %rsp +; FALLBACK1-NEXT: popq %rbx +; FALLBACK1-NEXT: popq %r14 +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: shl_64bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: pushq %rbp +; FALLBACK2-NEXT: pushq %r15 +; FALLBACK2-NEXT: pushq %r14 +; FALLBACK2-NEXT: pushq %r13 +; FALLBACK2-NEXT: pushq %r12 +; FALLBACK2-NEXT: pushq %rbx +; FALLBACK2-NEXT: pushq %rax +; FALLBACK2-NEXT: movq (%rdi), %rax +; FALLBACK2-NEXT: movq 8(%rdi), %rcx +; FALLBACK2-NEXT: movq 16(%rdi), %r8 +; FALLBACK2-NEXT: movq 24(%rdi), %r9 +; FALLBACK2-NEXT: movq 32(%rdi), %r10 +; FALLBACK2-NEXT: movq 40(%rdi), %r11 +; FALLBACK2-NEXT: movq 48(%rdi), %rbx +; FALLBACK2-NEXT: movq 56(%rdi), %rdi +; FALLBACK2-NEXT: movl (%rsi), %esi +; FALLBACK2-NEXT: xorps %xmm0, %xmm0 +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: leal (,%rsi,8), %eax +; FALLBACK2-NEXT: andl 
$56, %eax +; FALLBACK2-NEXT: andl $56, %esi +; FALLBACK2-NEXT: negl %esi +; FALLBACK2-NEXT: movslq %esi, %rsi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 +; FALLBACK2-NEXT: movl %eax, %r13d +; FALLBACK2-NEXT: notb %r13b +; FALLBACK2-NEXT: shrq %r10 +; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp +; FALLBACK2-NEXT: shrq %r14 +; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 +; FALLBACK2-NEXT: orq %r11, %r14 +; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax +; FALLBACK2-NEXT: shrq %rcx +; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrq %r9 +; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 +; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrq %rdi +; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi +; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrq %rsi +; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi +; FALLBACK2-NEXT: orq %r11, %rsi +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 +; FALLBACK2-NEXT: orq %rax, %r8 +; FALLBACK2-NEXT: movq %r12, (%rdx) +; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: movq %rsi, 56(%rdx) +; FALLBACK2-NEXT: movq %rdi, 32(%rdx) +; FALLBACK2-NEXT: movq %r9, 40(%rdx) +; FALLBACK2-NEXT: movq %rcx, 16(%rdx) +; FALLBACK2-NEXT: movq %r14, 24(%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: addq $8, %rsp +; FALLBACK2-NEXT: popq %rbx +; FALLBACK2-NEXT: popq %r12 +; FALLBACK2-NEXT: popq %r13 +; 
FALLBACK2-NEXT: popq %r14 +; FALLBACK2-NEXT: popq %r15 +; FALLBACK2-NEXT: popq %rbp +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: shl_64bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: pushq %r14 +; FALLBACK3-NEXT: pushq %rbx +; FALLBACK3-NEXT: pushq %rax +; FALLBACK3-NEXT: movq (%rdi), %rax +; FALLBACK3-NEXT: movq 8(%rdi), %rcx +; FALLBACK3-NEXT: movq 16(%rdi), %r8 +; FALLBACK3-NEXT: movq 24(%rdi), %r9 +; FALLBACK3-NEXT: movq 32(%rdi), %r10 +; FALLBACK3-NEXT: movq 40(%rdi), %r11 +; FALLBACK3-NEXT: movq 48(%rdi), %rbx +; FALLBACK3-NEXT: movq 56(%rdi), %rdi +; FALLBACK3-NEXT: movl (%rsi), %esi +; FALLBACK3-NEXT: xorps %xmm0, %xmm0 +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: leal (,%rsi,8), %ecx +; FALLBACK3-NEXT: andl $56, %ecx +; FALLBACK3-NEXT: andl $56, %esi +; FALLBACK3-NEXT: negl %esi +; FALLBACK3-NEXT: movslq %esi, %r8 +; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax +; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9 +; FALLBACK3-NEXT: movq %r9, %rsi +; FALLBACK3-NEXT: shldq %cl, %rax, %rsi +; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10 +; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi +; FALLBACK3-NEXT: shldq %cl, %rdi, %rax +; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11 +; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx +; FALLBACK3-NEXT: movq %rbx, %r14 +; FALLBACK3-NEXT: shldq %cl, %r11, %r14 +; FALLBACK3-NEXT: shldq %cl, %r9, %r11 +; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9 +; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK3-NEXT: 
shldq %cl, %r9, %r8 +; FALLBACK3-NEXT: shldq %cl, %rbx, %r9 +; FALLBACK3-NEXT: shldq %cl, %r10, %rdi +; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx +; FALLBACK3-NEXT: movq %r9, 48(%rdx) +; FALLBACK3-NEXT: movq %r8, 56(%rdx) +; FALLBACK3-NEXT: movq %r11, 32(%rdx) +; FALLBACK3-NEXT: movq %r14, 40(%rdx) +; FALLBACK3-NEXT: movq %rax, 16(%rdx) +; FALLBACK3-NEXT: movq %rsi, 24(%rdx) +; FALLBACK3-NEXT: movq %rcx, (%rdx) +; FALLBACK3-NEXT: movq %rdi, 8(%rdx) +; FALLBACK3-NEXT: addq $8, %rsp +; FALLBACK3-NEXT: popq %rbx +; FALLBACK3-NEXT: popq %r14 +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: shl_64bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %r15 +; FALLBACK4-NEXT: pushq %r14 +; FALLBACK4-NEXT: pushq %r13 +; FALLBACK4-NEXT: pushq %r12 +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK4-NEXT: movl (%rsi), %ecx +; FALLBACK4-NEXT: xorps %xmm4, %xmm4 +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: leal (,%rcx,8), %eax +; FALLBACK4-NEXT: andl $56, %eax +; FALLBACK4-NEXT: andl $56, %ecx +; FALLBACK4-NEXT: negl %ecx +; FALLBACK4-NEXT: movslq %ecx, %r9 +; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi +; FALLBACK4-NEXT: movq %rdi, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 +; FALLBACK4-NEXT: movq %r11, %r8 +; FALLBACK4-NEXT: shrq %r8 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r8 +; 
FALLBACK4-NEXT: orq %r10, %r8 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx +; FALLBACK4-NEXT: movq %rbx, %r10 +; FALLBACK4-NEXT: shrq %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: orq %r11, %r10 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15 +; FALLBACK4-NEXT: movq %r15, %r11 +; FALLBACK4-NEXT: shrq %r11 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r11 +; FALLBACK4-NEXT: orq %rbx, %r11 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r15 +; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14 +; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12 +; FALLBACK4-NEXT: movq %r12, %rbx +; FALLBACK4-NEXT: shrq %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: orq %r15, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r12 +; FALLBACK4-NEXT: movq %r14, %r15 +; FALLBACK4-NEXT: shrq %r15 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r15 +; FALLBACK4-NEXT: orq %r12, %r15 +; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12 +; FALLBACK4-NEXT: movq %r12, %r13 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r13 +; FALLBACK4-NEXT: shrq %rdi +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %rdi +; FALLBACK4-NEXT: orq %r13, %rdi +; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r9 +; FALLBACK4-NEXT: shrq %r12 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shrq %cl, %r12 +; FALLBACK4-NEXT: orq %r9, %r12 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shlq %cl, %r14 +; FALLBACK4-NEXT: movq %r14, (%rdx) +; FALLBACK4-NEXT: movq %r12, 56(%rdx) +; FALLBACK4-NEXT: movq %rdi, 48(%rdx) +; FALLBACK4-NEXT: movq %r15, 8(%rdx) +; FALLBACK4-NEXT: movq %rbx, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 24(%rdx) +; FALLBACK4-NEXT: movq 
%r10, 32(%rdx) +; FALLBACK4-NEXT: movq %r8, 40(%rdx) +; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r12 +; FALLBACK4-NEXT: popq %r13 +; FALLBACK4-NEXT: popq %r14 +; FALLBACK4-NEXT: popq %r15 +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: shl_64bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: pushq %r15 +; FALLBACK5-NEXT: pushq %r14 +; FALLBACK5-NEXT: pushq %rbx +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK5-NEXT: movl (%rsi), %eax +; FALLBACK5-NEXT: xorps %xmm4, %xmm4 +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: leal (,%rax,8), %ecx +; FALLBACK5-NEXT: andl $56, %ecx +; FALLBACK5-NEXT: andl $56, %eax +; FALLBACK5-NEXT: negl %eax +; FALLBACK5-NEXT: movslq %eax, %r8 +; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK5-NEXT: movq %r9, %rsi +; FALLBACK5-NEXT: shldq %cl, %rax, %rsi +; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK5-NEXT: shldq %cl, %rdi, %rax +; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK5-NEXT: shldq %cl, %r10, %rdi +; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK5-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK5-NEXT: movq %r14, %r15 +; FALLBACK5-NEXT: shldq %cl, %r9, %r15 +; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK5-NEXT: shldq %cl, %r14, %r8 +; FALLBACK5-NEXT: movq %r11, %r9 +; FALLBACK5-NEXT: shlq %cl, %r9 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: 
shldq %cl, %r11, %rbx +; FALLBACK5-NEXT: movq %r8, 56(%rdx) +; FALLBACK5-NEXT: movq %r15, 48(%rdx) +; FALLBACK5-NEXT: movq %rbx, 8(%rdx) +; FALLBACK5-NEXT: movq %r10, 16(%rdx) +; FALLBACK5-NEXT: movq %rdi, 24(%rdx) +; FALLBACK5-NEXT: movq %rax, 32(%rdx) +; FALLBACK5-NEXT: movq %rsi, 40(%rdx) +; FALLBACK5-NEXT: movq %r9, (%rdx) +; FALLBACK5-NEXT: popq %rbx +; FALLBACK5-NEXT: popq %r14 +; FALLBACK5-NEXT: popq %r15 +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: shl_64bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: pushq %rbp +; FALLBACK6-NEXT: pushq %r15 +; FALLBACK6-NEXT: pushq %r14 +; FALLBACK6-NEXT: pushq %r13 +; FALLBACK6-NEXT: pushq %r12 +; FALLBACK6-NEXT: pushq %rbx +; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: xorps %xmm4, %xmm4 +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: andl $56, %eax +; FALLBACK6-NEXT: negl %eax +; FALLBACK6-NEXT: movslq %eax, %rsi +; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax +; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 +; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 +; 
FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx +; FALLBACK6-NEXT: movl %ecx, %r9d +; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: shrq %rdi +; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: orq %r12, %rdi +; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp +; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 +; FALLBACK6-NEXT: shrq %r13 +; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 +; FALLBACK6-NEXT: orq %r15, %r12 +; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 +; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK6-NEXT: shrq %r14 +; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 +; FALLBACK6-NEXT: orq %r10, %r14 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi +; FALLBACK6-NEXT: orq %rbx, %rsi +; FALLBACK6-NEXT: shrq %rax +; FALLBACK6-NEXT: shrxq %r9, %rax, %rax +; FALLBACK6-NEXT: orq %r8, %rax +; FALLBACK6-NEXT: shrq %rbp +; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 +; FALLBACK6-NEXT: orq %r15, %r8 +; FALLBACK6-NEXT: movq %rcx, (%rdx) +; FALLBACK6-NEXT: movq %r8, 56(%rdx) +; FALLBACK6-NEXT: movq %rax, 48(%rdx) +; FALLBACK6-NEXT: movq %rsi, 8(%rdx) +; FALLBACK6-NEXT: movq %r14, 16(%rdx) +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %r12, 32(%rdx) +; FALLBACK6-NEXT: movq %rdi, 40(%rdx) +; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: popq %rbx +; FALLBACK6-NEXT: popq %r12 +; FALLBACK6-NEXT: popq %r13 +; FALLBACK6-NEXT: popq %r14 +; FALLBACK6-NEXT: popq %r15 +; FALLBACK6-NEXT: popq %rbp +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: shl_64bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: pushq %r15 +; FALLBACK7-NEXT: pushq %r14 +; FALLBACK7-NEXT: pushq %rbx +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 +; FALLBACK7-NEXT: movl (%rsi), %eax +; 
FALLBACK7-NEXT: xorps %xmm4, %xmm4 +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: leal (,%rax,8), %ecx +; FALLBACK7-NEXT: andl $56, %ecx +; FALLBACK7-NEXT: andl $56, %eax +; FALLBACK7-NEXT: negl %eax +; FALLBACK7-NEXT: movslq %eax, %r8 +; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK7-NEXT: movq %r9, %rsi +; FALLBACK7-NEXT: shldq %cl, %rax, %rsi +; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK7-NEXT: shldq %cl, %rdi, %rax +; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK7-NEXT: shldq %cl, %r10, %rdi +; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK7-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK7-NEXT: movq %r14, %r15 +; FALLBACK7-NEXT: shldq %cl, %r9, %r15 +; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK7-NEXT: shldq %cl, %r14, %r8 +; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK7-NEXT: shldq %cl, %r11, %rbx +; FALLBACK7-NEXT: movq %r8, 56(%rdx) +; FALLBACK7-NEXT: movq %r15, 48(%rdx) +; FALLBACK7-NEXT: movq %rbx, 8(%rdx) +; FALLBACK7-NEXT: movq %r10, 16(%rdx) +; FALLBACK7-NEXT: movq %rdi, 24(%rdx) +; FALLBACK7-NEXT: movq %rax, 32(%rdx) +; FALLBACK7-NEXT: movq %rsi, 40(%rdx) +; FALLBACK7-NEXT: movq %r9, (%rdx) +; FALLBACK7-NEXT: popq %rbx +; FALLBACK7-NEXT: popq %r14 +; FALLBACK7-NEXT: popq %r15 +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: shl_64bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %r15 +; FALLBACK8-NEXT: pushq %r14 +; FALLBACK8-NEXT: pushq %r13 +; 
FALLBACK8-NEXT: pushq %r12 +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK8-NEXT: movl (%rsi), %ecx +; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: leal (,%rcx,8), %eax +; FALLBACK8-NEXT: andl $56, %eax +; FALLBACK8-NEXT: andl $56, %ecx +; FALLBACK8-NEXT: negl %ecx +; FALLBACK8-NEXT: movslq %ecx, %r9 +; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi +; FALLBACK8-NEXT: movq %rdi, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 +; FALLBACK8-NEXT: movq %r11, %r8 +; FALLBACK8-NEXT: shrq %r8 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r8 +; FALLBACK8-NEXT: orq %r10, %r8 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx +; FALLBACK8-NEXT: movq %rbx, %r10 +; FALLBACK8-NEXT: shrq %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: orq %r11, %r10 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15 +; FALLBACK8-NEXT: movq %r15, %r11 +; FALLBACK8-NEXT: shrq %r11 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r11 +; FALLBACK8-NEXT: orq %rbx, %r11 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r15 +; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14 +; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12 +; FALLBACK8-NEXT: movq %r12, %rbx +; FALLBACK8-NEXT: shrq %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: orq %r15, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r12 +; FALLBACK8-NEXT: 
movq %r14, %r15 +; FALLBACK8-NEXT: shrq %r15 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r15 +; FALLBACK8-NEXT: orq %r12, %r15 +; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12 +; FALLBACK8-NEXT: movq %r12, %r13 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r13 +; FALLBACK8-NEXT: shrq %rdi +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %rdi +; FALLBACK8-NEXT: orq %r13, %rdi +; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r9 +; FALLBACK8-NEXT: shrq %r12 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shrq %cl, %r12 +; FALLBACK8-NEXT: orq %r9, %r12 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shlq %cl, %r14 +; FALLBACK8-NEXT: movq %r14, (%rdx) +; FALLBACK8-NEXT: movq %r12, 56(%rdx) +; FALLBACK8-NEXT: movq %rdi, 48(%rdx) +; FALLBACK8-NEXT: movq %r15, 8(%rdx) +; FALLBACK8-NEXT: movq %rbx, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 24(%rdx) +; FALLBACK8-NEXT: movq %r10, 32(%rdx) +; FALLBACK8-NEXT: movq %r8, 40(%rdx) +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r12 +; FALLBACK8-NEXT: popq %r13 +; FALLBACK8-NEXT: popq %r14 +; FALLBACK8-NEXT: popq %r15 +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: shl_64bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: pushq %r15 +; FALLBACK9-NEXT: pushq %r14 +; FALLBACK9-NEXT: pushq %rbx +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: andl $56, %ecx +; FALLBACK9-NEXT: andl $56, %eax +; FALLBACK9-NEXT: negl %eax +; FALLBACK9-NEXT: movslq %eax, %r8 +; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax +; 
FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK9-NEXT: movq %r9, %rsi +; FALLBACK9-NEXT: shldq %cl, %rax, %rsi +; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK9-NEXT: shldq %cl, %rdi, %rax +; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK9-NEXT: shldq %cl, %r10, %rdi +; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK9-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK9-NEXT: movq %r14, %r15 +; FALLBACK9-NEXT: shldq %cl, %r9, %r15 +; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK9-NEXT: shldq %cl, %r14, %r8 +; FALLBACK9-NEXT: movq %r11, %r9 +; FALLBACK9-NEXT: shlq %cl, %r9 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK9-NEXT: shldq %cl, %r11, %rbx +; FALLBACK9-NEXT: movq %r8, 56(%rdx) +; FALLBACK9-NEXT: movq %r15, 48(%rdx) +; FALLBACK9-NEXT: movq %rbx, 8(%rdx) +; FALLBACK9-NEXT: movq %r10, 16(%rdx) +; FALLBACK9-NEXT: movq %rdi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 40(%rdx) +; FALLBACK9-NEXT: movq %r9, (%rdx) +; FALLBACK9-NEXT: popq %rbx +; FALLBACK9-NEXT: popq %r14 +; FALLBACK9-NEXT: popq %r15 +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: shl_64bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: pushq %rbp +; FALLBACK10-NEXT: pushq %r15 +; FALLBACK10-NEXT: pushq %r14 +; FALLBACK10-NEXT: pushq %r13 +; FALLBACK10-NEXT: pushq %r12 +; FALLBACK10-NEXT: pushq %rbx +; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: leal (,%rax,8), %ecx +; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: andl $56, 
%eax +; FALLBACK10-NEXT: negl %eax +; FALLBACK10-NEXT: movslq %eax, %rsi +; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax +; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 +; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 +; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx +; FALLBACK10-NEXT: movl %ecx, %r9d +; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: shrq %rdi +; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: orq %r12, %rdi +; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp +; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 +; FALLBACK10-NEXT: shrq %r13 +; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 +; FALLBACK10-NEXT: orq %r15, %r12 +; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: shrq %r11 +; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 +; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK10-NEXT: shrq %r14 +; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 +; FALLBACK10-NEXT: orq %r10, %r14 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi +; FALLBACK10-NEXT: orq %rbx, %rsi +; FALLBACK10-NEXT: shrq %rax +; FALLBACK10-NEXT: shrxq %r9, %rax, %rax +; FALLBACK10-NEXT: orq %r8, %rax +; FALLBACK10-NEXT: shrq %rbp +; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 +; FALLBACK10-NEXT: orq %r15, %r8 +; FALLBACK10-NEXT: movq %rcx, (%rdx) +; FALLBACK10-NEXT: movq %r8, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 48(%rdx) +; FALLBACK10-NEXT: movq %rsi, 8(%rdx) +; FALLBACK10-NEXT: movq %r14, 16(%rdx) +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %r12, 32(%rdx) +; FALLBACK10-NEXT: movq %rdi, 40(%rdx) +; 
FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: popq %rbx +; FALLBACK10-NEXT: popq %r12 +; FALLBACK10-NEXT: popq %r13 +; FALLBACK10-NEXT: popq %r14 +; FALLBACK10-NEXT: popq %r15 +; FALLBACK10-NEXT: popq %rbp +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: shl_64bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: pushq %r15 +; FALLBACK11-NEXT: pushq %r14 +; FALLBACK11-NEXT: pushq %rbx +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 +; FALLBACK11-NEXT: movl (%rsi), %eax +; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: leal (,%rax,8), %ecx +; FALLBACK11-NEXT: andl $56, %ecx +; FALLBACK11-NEXT: andl $56, %eax +; FALLBACK11-NEXT: negl %eax +; FALLBACK11-NEXT: movslq %eax, %r8 +; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK11-NEXT: movq %r9, %rsi +; FALLBACK11-NEXT: shldq %cl, %rax, %rsi +; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK11-NEXT: shldq %cl, %rdi, %rax +; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK11-NEXT: shldq %cl, %r10, %rdi +; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK11-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK11-NEXT: movq %r14, %r15 +; FALLBACK11-NEXT: shldq %cl, %r9, %r15 +; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK11-NEXT: shldq %cl, %r14, %r8 +; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK11-NEXT: shldq %cl, %r11, %rbx +; FALLBACK11-NEXT: movq %r8, 56(%rdx) +; FALLBACK11-NEXT: movq %r15, 48(%rdx) +; FALLBACK11-NEXT: movq %rbx, 8(%rdx) +; FALLBACK11-NEXT: movq %r10, 16(%rdx) +; FALLBACK11-NEXT: movq %rdi, 
24(%rdx) +; FALLBACK11-NEXT: movq %rax, 32(%rdx) +; FALLBACK11-NEXT: movq %rsi, 40(%rdx) +; FALLBACK11-NEXT: movq %r9, (%rdx) +; FALLBACK11-NEXT: popq %rbx +; FALLBACK11-NEXT: popq %r14 +; FALLBACK11-NEXT: popq %r15 +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: shl_64bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %r15 +; FALLBACK12-NEXT: pushq %r14 +; FALLBACK12-NEXT: pushq %r13 +; FALLBACK12-NEXT: pushq %r12 +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK12-NEXT: movl (%rsi), %ecx +; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: leal (,%rcx,8), %eax +; FALLBACK12-NEXT: andl $56, %eax +; FALLBACK12-NEXT: andl $56, %ecx +; FALLBACK12-NEXT: negl %ecx +; FALLBACK12-NEXT: movslq %ecx, %r9 +; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi +; FALLBACK12-NEXT: movq %rdi, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 +; FALLBACK12-NEXT: movq %r11, %r8 +; FALLBACK12-NEXT: shrq %r8 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r8 +; FALLBACK12-NEXT: orq %r10, %r8 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq %rbx, %r10 +; FALLBACK12-NEXT: shrq %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: orq %r11, %r10 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15 +; FALLBACK12-NEXT: movq %r15, %r11 +; FALLBACK12-NEXT: shrq %r11 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r11 +; FALLBACK12-NEXT: orq %rbx, %r11 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r15 +; FALLBACK12-NEXT: 
movq -64(%rsp,%r9), %r14 +; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq %r12, %rbx +; FALLBACK12-NEXT: shrq %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: orq %r15, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r12 +; FALLBACK12-NEXT: movq %r14, %r15 +; FALLBACK12-NEXT: shrq %r15 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r15 +; FALLBACK12-NEXT: orq %r12, %r15 +; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq %r12, %r13 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r13 +; FALLBACK12-NEXT: shrq %rdi +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %rdi +; FALLBACK12-NEXT: orq %r13, %rdi +; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: shrq %r12 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shrq %cl, %r12 +; FALLBACK12-NEXT: orq %r9, %r12 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shlq %cl, %r14 +; FALLBACK12-NEXT: movq %r14, (%rdx) +; FALLBACK12-NEXT: movq %r12, 56(%rdx) +; FALLBACK12-NEXT: movq %rdi, 48(%rdx) +; FALLBACK12-NEXT: movq %r15, 8(%rdx) +; FALLBACK12-NEXT: movq %rbx, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 24(%rdx) +; FALLBACK12-NEXT: movq %r10, 32(%rdx) +; FALLBACK12-NEXT: movq %r8, 40(%rdx) +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r12 +; FALLBACK12-NEXT: popq %r13 +; FALLBACK12-NEXT: popq %r14 +; FALLBACK12-NEXT: popq %r15 +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: shl_64bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: pushq %r15 +; FALLBACK13-NEXT: pushq %r14 +; FALLBACK13-NEXT: pushq %rbx +; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK13-NEXT: movl (%rsi), %eax +; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %zmm0, 
-{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: leal (,%rax,8), %ecx +; FALLBACK13-NEXT: andl $56, %ecx +; FALLBACK13-NEXT: andl $56, %eax +; FALLBACK13-NEXT: negl %eax +; FALLBACK13-NEXT: movslq %eax, %r8 +; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK13-NEXT: movq %r9, %rsi +; FALLBACK13-NEXT: shldq %cl, %rax, %rsi +; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK13-NEXT: shldq %cl, %rdi, %rax +; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK13-NEXT: shldq %cl, %r10, %rdi +; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK13-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK13-NEXT: movq %r14, %r15 +; FALLBACK13-NEXT: shldq %cl, %r9, %r15 +; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK13-NEXT: shldq %cl, %r14, %r8 +; FALLBACK13-NEXT: movq %r11, %r9 +; FALLBACK13-NEXT: shlq %cl, %r9 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: shldq %cl, %r11, %rbx +; FALLBACK13-NEXT: movq %r8, 56(%rdx) +; FALLBACK13-NEXT: movq %r15, 48(%rdx) +; FALLBACK13-NEXT: movq %rbx, 8(%rdx) +; FALLBACK13-NEXT: movq %r10, 16(%rdx) +; FALLBACK13-NEXT: movq %rdi, 24(%rdx) +; FALLBACK13-NEXT: movq %rax, 32(%rdx) +; FALLBACK13-NEXT: movq %rsi, 40(%rdx) +; FALLBACK13-NEXT: movq %r9, (%rdx) +; FALLBACK13-NEXT: popq %rbx +; FALLBACK13-NEXT: popq %r14 +; FALLBACK13-NEXT: popq %r15 +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: shl_64bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: pushq %rbp +; FALLBACK14-NEXT: pushq %r15 +; FALLBACK14-NEXT: pushq %r14 +; FALLBACK14-NEXT: pushq %r13 +; FALLBACK14-NEXT: pushq %r12 +; FALLBACK14-NEXT: pushq %rbx +; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups 
%zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: leal (,%rax,8), %ecx +; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: negl %eax +; FALLBACK14-NEXT: movslq %eax, %rsi +; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax +; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 +; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 +; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx +; FALLBACK14-NEXT: movl %ecx, %r9d +; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: shrq %rdi +; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: orq %r12, %rdi +; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp +; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 +; FALLBACK14-NEXT: shrq %r13 +; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 +; FALLBACK14-NEXT: orq %r15, %r12 +; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 +; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload +; FALLBACK14-NEXT: shrq %r14 +; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 +; FALLBACK14-NEXT: orq %r10, %r14 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi +; FALLBACK14-NEXT: orq %rbx, %rsi +; FALLBACK14-NEXT: shrq %rax +; FALLBACK14-NEXT: shrxq %r9, %rax, %rax +; FALLBACK14-NEXT: orq %r8, %rax +; FALLBACK14-NEXT: shrq %rbp +; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 +; FALLBACK14-NEXT: orq %r15, %r8 +; FALLBACK14-NEXT: movq %rcx, (%rdx) +; FALLBACK14-NEXT: movq %r8, 56(%rdx) +; FALLBACK14-NEXT: movq %rax, 48(%rdx) +; FALLBACK14-NEXT: movq %rsi, 8(%rdx) +; FALLBACK14-NEXT: movq %r14, 16(%rdx) +; 
FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %r12, 32(%rdx) +; FALLBACK14-NEXT: movq %rdi, 40(%rdx) +; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: popq %rbx +; FALLBACK14-NEXT: popq %r12 +; FALLBACK14-NEXT: popq %r13 +; FALLBACK14-NEXT: popq %r14 +; FALLBACK14-NEXT: popq %r15 +; FALLBACK14-NEXT: popq %rbp +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: shl_64bytes: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: pushq %r15 +; FALLBACK15-NEXT: pushq %r14 +; FALLBACK15-NEXT: pushq %rbx +; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 +; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: andl $56, %ecx +; FALLBACK15-NEXT: andl $56, %eax +; FALLBACK15-NEXT: negl %eax +; FALLBACK15-NEXT: movslq %eax, %r8 +; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax +; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9 +; FALLBACK15-NEXT: movq %r9, %rsi +; FALLBACK15-NEXT: shldq %cl, %rax, %rsi +; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi +; FALLBACK15-NEXT: shldq %cl, %rdi, %rax +; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10 +; FALLBACK15-NEXT: shldq %cl, %r10, %rdi +; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11 +; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx +; FALLBACK15-NEXT: shldq %cl, %rbx, %r10 +; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14 +; FALLBACK15-NEXT: movq %r14, %r15 +; FALLBACK15-NEXT: shldq %cl, %r9, %r15 +; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8 +; FALLBACK15-NEXT: shldq %cl, %r14, %r8 +; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9 +; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK15-NEXT: shldq %cl, %r11, %rbx +; FALLBACK15-NEXT: movq %r8, 56(%rdx) +; FALLBACK15-NEXT: movq %r15, 48(%rdx) +; FALLBACK15-NEXT: movq %rbx, 8(%rdx) +; FALLBACK15-NEXT: movq %r10, 16(%rdx) +; FALLBACK15-NEXT: movq %rdi, 24(%rdx) +; FALLBACK15-NEXT: movq 
%rax, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 40(%rdx) +; FALLBACK15-NEXT: movq %r9, (%rdx) +; FALLBACK15-NEXT: popq %rbx +; FALLBACK15-NEXT: popq %r14 +; FALLBACK15-NEXT: popq %r15 +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: shl_64bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $204, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl (%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 16(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 20(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 24(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 28(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 32(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%eax), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 40(%eax), %ebp +; FALLBACK16-NEXT: movl 44(%eax), %ebx +; FALLBACK16-NEXT: movl 48(%eax), %edi +; FALLBACK16-NEXT: movl 52(%eax), %esi +; FALLBACK16-NEXT: movl 56(%eax), %edx +; FALLBACK16-NEXT: movl 60(%eax), %ecx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl (%eax), %eax +; FALLBACK16-NEXT: xorps %xmm0, %xmm0 +; FALLBACK16-NEXT: 
movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %eax, %edx +; FALLBACK16-NEXT: andl $60, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: subl %edx, %ecx +; FALLBACK16-NEXT: movl (%ecx), %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill +; FALLBACK16-NEXT: movl 4(%ecx), %edx +; FALLBACK16-NEXT: movl %ecx, %ebp +; FALLBACK16-NEXT: shll $3, %eax +; FALLBACK16-NEXT: andl $24, %eax +; FALLBACK16-NEXT: movl %edx, %esi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %al, %ch +; FALLBACK16-NEXT: notb %ch +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl 8(%ebp), %esi +; FALLBACK16-NEXT: movl %ebp, %edi +; FALLBACK16-NEXT: movl %esi, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %esi, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edi, %ebp +; FALLBACK16-NEXT: movl 20(%edi), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl 16(%edi), %esi +; FALLBACK16-NEXT: movl %esi, %edx +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %ch, 
%cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movl 28(%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl 24(%ebp), %esi +; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %esi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%edx), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl 32(%edx), %esi +; FALLBACK16-NEXT: movl %edx, %ebp +; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %esi, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 44(%ebp), %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: 
shll %cl, %ebx +; FALLBACK16-NEXT: movl 40(%ebp), %esi +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %esi, %edx +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %esi, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 52(%ebp), %esi +; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: negl %edx +; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK16-NEXT: movl %ebx, %ebp +; FALLBACK16-NEXT: shrl %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shrl %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: movl 60(%edi), %edx +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl 56(%edi), %ebx +; FALLBACK16-NEXT: movl %ebx, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %edx, %edi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; 
FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: orl %ebx, %esi +; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %edx, (%eax) +; FALLBACK16-NEXT: movl %esi, 56(%eax) +; FALLBACK16-NEXT: movl %edi, 60(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 48(%eax) +; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 40(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 44(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 32(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 36(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 24(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 28(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: 
retl +; +; FALLBACK17-LABEL: shl_64bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 12(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 16(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 20(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 24(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 28(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 36(%ecx), %eax +; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%ecx), %ebp +; FALLBACK17-NEXT: movl 44(%ecx), %ebx +; FALLBACK17-NEXT: movl 48(%ecx), %edi +; FALLBACK17-NEXT: movl 52(%ecx), %esi +; FALLBACK17-NEXT: movl 56(%ecx), %edx +; FALLBACK17-NEXT: movl 60(%ecx), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %ecx +; FALLBACK17-NEXT: xorps %xmm0, %xmm0 +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl 
%ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ecx, %ebp +; FALLBACK17-NEXT: andl $60, %ebp +; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: subl %ebp, %eax +; FALLBACK17-NEXT: movl 8(%eax), %esi +; FALLBACK17-NEXT: movl 12(%eax), %edx +; FALLBACK17-NEXT: shll $3, %ecx +; FALLBACK17-NEXT: andl $24, %ecx +; FALLBACK17-NEXT: movl %edx, %edi +; FALLBACK17-NEXT: shldl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%eax), %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, 
%edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 16(%eax), %edi +; FALLBACK17-NEXT: movl 20(%eax), %esi +; FALLBACK17-NEXT: movl %esi, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %edx, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 24(%eax), %edi +; FALLBACK17-NEXT: movl 28(%eax), %edx +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%eax), %edi +; FALLBACK17-NEXT: movl 36(%eax), %esi +; FALLBACK17-NEXT: movl %esi, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %edx, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%eax), %edx +; FALLBACK17-NEXT: movl 44(%eax), %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %edx, %edi +; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 56(%eax), %edx +; FALLBACK17-NEXT: movl 60(%eax), %edi +; FALLBACK17-NEXT: shldl %cl, %edx, %edi +; FALLBACK17-NEXT: movl (%eax), %ebx +; FALLBACK17-NEXT: movl 52(%eax), %esi +; FALLBACK17-NEXT: shldl %cl, %esi, %edx +; FALLBACK17-NEXT: negl %ebp +; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl %edx, 56(%ebp) +; FALLBACK17-NEXT: movl %edi, 60(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
FALLBACK17-NEXT: shldl %cl, %ebx, %edx +; FALLBACK17-NEXT: shll %cl, %ebx +; FALLBACK17-NEXT: shldl %cl, %eax, %esi +; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK17-NEXT: shldl %cl, %edi, %eax +; FALLBACK17-NEXT: movl %eax, 48(%ebp) +; FALLBACK17-NEXT: movl %esi, 52(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 44(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 32(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 36(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 24(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 16(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 20(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl %edx, 4(%ebp) +; FALLBACK17-NEXT: addl $188, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: shl_64bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $204, %esp +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
FALLBACK18-NEXT: movl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 12(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 16(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 20(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 24(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 28(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 36(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%eax), %ebx +; FALLBACK18-NEXT: movl 44(%eax), %edi +; FALLBACK18-NEXT: movl 48(%eax), %esi +; FALLBACK18-NEXT: movl 52(%eax), %edx +; FALLBACK18-NEXT: movl 56(%eax), %ecx +; FALLBACK18-NEXT: movl 60(%eax), %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK18-NEXT: movl (%ebp), %ebp +; FALLBACK18-NEXT: xorps %xmm0, %xmm0 +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
+; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebp,8), %edx +; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: andl $60, %ebp +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: subl %ebp, %edi +; FALLBACK18-NEXT: movl (%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edi), %esi +; FALLBACK18-NEXT: movl 
%esi, %ecx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: movl 12(%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 16(%edi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: movl 20(%edi), %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 24(%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: movl 28(%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %eax, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %eax +; 
FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: movl 36(%edi), %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: movl 44(%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %eax, %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 52(%edi), %esi +; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp +; FALLBACK18-NEXT: orl %eax, %ebp +; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: negl %eax +; FALLBACK18-NEXT: shlxl %edx, 
188(%esp,%eax), %ecx +; FALLBACK18-NEXT: movl 56(%edi), %eax +; FALLBACK18-NEXT: shlxl %edx, %eax, %edx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %edx, %esi +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, (%eax) +; FALLBACK18-NEXT: movl %esi, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 60(%eax) +; FALLBACK18-NEXT: movl %ebp, 48(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 44(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 32(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 36(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 24(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 28(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 16(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $204, %esp +; FALLBACK18-NEXT: 
popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: shl_64bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $204, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl (%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 12(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 16(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 20(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 24(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 28(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 36(%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%ebp), %ebx +; FALLBACK19-NEXT: movl 44(%ebp), %edi +; FALLBACK19-NEXT: movl 48(%ebp), %esi +; FALLBACK19-NEXT: movl 52(%ebp), %edx +; FALLBACK19-NEXT: movl 56(%ebp), %ecx +; FALLBACK19-NEXT: movl 60(%ebp), %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl (%ebp), %ebp +; FALLBACK19-NEXT: xorps %xmm0, %xmm0 +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: leal (,%ebp,8), %ecx +; FALLBACK19-NEXT: andl $24, %ecx +; FALLBACK19-NEXT: andl $60, %ebp +; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: subl %ebp, %eax +; FALLBACK19-NEXT: movl 4(%eax), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%eax), %edi +; FALLBACK19-NEXT: movl 12(%eax), %edx +; FALLBACK19-NEXT: movl %edx, %ebx +; 
FALLBACK19-NEXT: shldl %cl, %edi, %ebx +; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %esi, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 16(%eax), %edi +; FALLBACK19-NEXT: movl 20(%eax), %esi +; FALLBACK19-NEXT: movl %esi, %ebx +; FALLBACK19-NEXT: shldl %cl, %edi, %ebx +; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 24(%eax), %edi +; FALLBACK19-NEXT: movl 28(%eax), %edx +; FALLBACK19-NEXT: movl %edx, %ebx +; FALLBACK19-NEXT: shldl %cl, %edi, %ebx +; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %esi, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%eax), %edi +; FALLBACK19-NEXT: movl 36(%eax), %esi +; FALLBACK19-NEXT: movl %esi, %ebx +; FALLBACK19-NEXT: shldl %cl, %edi, %ebx +; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%eax), %ebx +; FALLBACK19-NEXT: movl 44(%eax), %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shldl %cl, %esi, %ebx +; FALLBACK19-NEXT: movl 56(%eax), %edx +; FALLBACK19-NEXT: movl 60(%eax), %edi +; FALLBACK19-NEXT: shldl %cl, %edx, %edi +; FALLBACK19-NEXT: movl (%eax), %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 52(%eax), %esi +; FALLBACK19-NEXT: shldl %cl, %esi, %edx +; FALLBACK19-NEXT: negl %ebp +; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
FALLBACK19-NEXT: movl %edx, 56(%eax) +; FALLBACK19-NEXT: movl %edi, 60(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK19-NEXT: shldl %cl, %edx, %edi +; FALLBACK19-NEXT: shldl %cl, %ebp, %esi +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shldl %cl, %edx, %ebp +; FALLBACK19-NEXT: movl %ebp, 48(%eax) +; FALLBACK19-NEXT: movl %esi, 52(%eax) +; FALLBACK19-NEXT: movl %ebx, 40(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 44(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 32(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 36(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 24(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 28(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 16(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 20(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 8(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 12(%eax) +; FALLBACK19-NEXT: movl %edi, 4(%eax) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, (%eax) +; FALLBACK19-NEXT: addl $204, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; 
FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: shl_64bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK20-NEXT: movl (%eax), %eax +; FALLBACK20-NEXT: xorps %xmm4, %xmm4 +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: andl $60, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: subl %edx, %ecx +; FALLBACK20-NEXT: movl (%ecx), %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 4(%ecx), %edx +; FALLBACK20-NEXT: movl %ecx, %ebp +; FALLBACK20-NEXT: shll $3, %eax +; FALLBACK20-NEXT: andl $24, %eax +; FALLBACK20-NEXT: movl %edx, %esi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %al, %ch +; FALLBACK20-NEXT: notb %ch +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %esi, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 12(%ebp), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl 8(%ebp), %esi +; FALLBACK20-NEXT: movl %ebp, %edi +; FALLBACK20-NEXT: movl %esi, %ebp +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %esi, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edi, %ebp +; FALLBACK20-NEXT: movl 20(%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl 16(%edi), %esi +; FALLBACK20-NEXT: movl %esi, %edx +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %esi, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %ebp, %edx +; FALLBACK20-NEXT: movl 28(%ebp), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl 24(%ebp), %esi +; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %esi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%edx), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl 32(%edx), %esi +; FALLBACK20-NEXT: movl %edx, %ebp +; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %esi, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 44(%ebp), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl 40(%ebp), %esi +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %esi, %edx +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %esi, %edx +; 
FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 52(%ebp), %esi +; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: negl %edx +; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: shrl %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: movl 60(%edi), %edx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl 56(%edi), %ebx +; FALLBACK20-NEXT: movl %ebx, %edi +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %edx, %edi +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl %edx, (%eax) +; FALLBACK20-NEXT: movl %esi, 56(%eax) +; FALLBACK20-NEXT: movl %edi, 60(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 48(%eax) +; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload +; FALLBACK20-NEXT: movl %ecx, 40(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 44(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 32(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 36(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 24(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 28(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 4(%eax) +; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: shl_64bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $188, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movups (%ecx), %xmm0 +; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK21-NEXT: movl (%eax), %ecx +; FALLBACK21-NEXT: xorps %xmm4, %xmm4 +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, 
{{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %ecx, %ebp +; FALLBACK21-NEXT: andl $60, %ebp +; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: subl %ebp, %eax +; FALLBACK21-NEXT: movl 8(%eax), %esi +; FALLBACK21-NEXT: movl 12(%eax), %edx +; FALLBACK21-NEXT: shll $3, %ecx +; FALLBACK21-NEXT: andl $24, %ecx +; FALLBACK21-NEXT: movl %edx, %edi +; FALLBACK21-NEXT: shldl %cl, %esi, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 4(%eax), %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 16(%eax), %edi +; FALLBACK21-NEXT: movl 20(%eax), %esi +; FALLBACK21-NEXT: movl %esi, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %edx, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 24(%eax), %edi +; FALLBACK21-NEXT: movl 28(%eax), %edx +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %esi, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 32(%eax), %edi +; FALLBACK21-NEXT: movl 36(%eax), %esi +; FALLBACK21-NEXT: movl %esi, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %edx, %edi +; FALLBACK21-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 40(%eax), %edx +; FALLBACK21-NEXT: movl 44(%eax), %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %edx, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shldl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 56(%eax), %edx +; FALLBACK21-NEXT: movl 60(%eax), %edi +; FALLBACK21-NEXT: shldl %cl, %edx, %edi +; FALLBACK21-NEXT: movl (%eax), %ebx +; FALLBACK21-NEXT: movl 52(%eax), %esi +; FALLBACK21-NEXT: shldl %cl, %esi, %edx +; FALLBACK21-NEXT: negl %ebp +; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %edx, 56(%ebp) +; FALLBACK21-NEXT: movl %edi, 60(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: shldl %cl, %ebx, %edx +; FALLBACK21-NEXT: shll %cl, %ebx +; FALLBACK21-NEXT: shldl %cl, %eax, %esi +; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK21-NEXT: shldl %cl, %edi, %eax +; FALLBACK21-NEXT: movl %eax, 48(%ebp) +; FALLBACK21-NEXT: movl %esi, 52(%ebp) +; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 40(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 44(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 32(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 36(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 24(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax 
# 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 12(%ebp) +; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl %edx, 4(%ebp) +; FALLBACK21-NEXT: addl $188, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: shl_64bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $204, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK22-NEXT: movl (%eax), %eax +; FALLBACK22-NEXT: xorps %xmm4, %xmm4 +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: leal (,%eax,8), %edx +; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: andl $60, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi +; FALLBACK22-NEXT: subl %eax, %edi +; FALLBACK22-NEXT: movl (%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edi), %esi +; FALLBACK22-NEXT: movl %esi, %ecx +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: movl 12(%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %esi +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 16(%edi), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl 20(%edi), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 24(%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: movl 28(%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 32(%edi), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl 36(%edi), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 40(%edi), %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK22-NEXT: movl 44(%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%edi), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 52(%edi), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrl %ecx +; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp +; FALLBACK22-NEXT: orl %eax, %ebp +; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: negl %eax +; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx +; FALLBACK22-NEXT: movl 56(%edi), %eax +; FALLBACK22-NEXT: shlxl %edx, %eax, %edx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %edx, %esi +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK22-NEXT: movl %edx, (%eax) +; FALLBACK22-NEXT: movl %esi, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 60(%eax) +; FALLBACK22-NEXT: movl %ebp, 48(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 44(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 32(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 36(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 24(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 28(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 16(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 20(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 8(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 12(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: addl $204, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: shl_64bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $204, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movups (%ecx), %xmm0 +; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 +; FALLBACK23-NEXT: movl (%eax), %ebp +; FALLBACK23-NEXT: xorps %xmm4, %xmm4 +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: leal (,%ebp,8), %ecx +; FALLBACK23-NEXT: andl $24, %ecx +; FALLBACK23-NEXT: andl $60, %ebp +; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: subl %ebp, %eax +; FALLBACK23-NEXT: movl 4(%eax), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 8(%eax), %edi +; FALLBACK23-NEXT: movl 12(%eax), %edx +; FALLBACK23-NEXT: movl %edx, %ebx +; FALLBACK23-NEXT: shldl %cl, 
%edi, %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %esi, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 16(%eax), %edi +; FALLBACK23-NEXT: movl 20(%eax), %esi +; FALLBACK23-NEXT: movl %esi, %ebx +; FALLBACK23-NEXT: shldl %cl, %edi, %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 24(%eax), %edi +; FALLBACK23-NEXT: movl 28(%eax), %edx +; FALLBACK23-NEXT: movl %edx, %ebx +; FALLBACK23-NEXT: shldl %cl, %edi, %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %esi, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 32(%eax), %edi +; FALLBACK23-NEXT: movl 36(%eax), %esi +; FALLBACK23-NEXT: movl %esi, %ebx +; FALLBACK23-NEXT: shldl %cl, %edi, %ebx +; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 40(%eax), %ebx +; FALLBACK23-NEXT: movl 44(%eax), %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %ebx, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shldl %cl, %esi, %ebx +; FALLBACK23-NEXT: movl 56(%eax), %edx +; FALLBACK23-NEXT: movl 60(%eax), %edi +; FALLBACK23-NEXT: shldl %cl, %edx, %edi +; FALLBACK23-NEXT: movl (%eax), %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 52(%eax), %esi +; FALLBACK23-NEXT: shldl %cl, %esi, %edx +; FALLBACK23-NEXT: negl %ebp +; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movl %edx, 
56(%eax) +; FALLBACK23-NEXT: movl %edi, 60(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK23-NEXT: shldl %cl, %edx, %edi +; FALLBACK23-NEXT: shldl %cl, %ebp, %esi +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shldl %cl, %edx, %ebp +; FALLBACK23-NEXT: movl %ebp, 48(%eax) +; FALLBACK23-NEXT: movl %esi, 52(%eax) +; FALLBACK23-NEXT: movl %ebx, 40(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 44(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 32(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 36(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 24(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 28(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 16(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 20(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 8(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 12(%eax) +; FALLBACK23-NEXT: movl %edi, 4(%eax) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, (%eax) +; FALLBACK23-NEXT: addl $204, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; 
FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: shl_64bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK24-NEXT: movl (%eax), %eax +; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: andl $60, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: subl %edx, %ecx +; FALLBACK24-NEXT: movl (%ecx), %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 4(%ecx), %edx +; FALLBACK24-NEXT: movl %ecx, %ebp +; FALLBACK24-NEXT: shll $3, %eax +; FALLBACK24-NEXT: andl $24, %eax +; FALLBACK24-NEXT: movl %edx, %esi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %al, %ch +; FALLBACK24-NEXT: notb %ch +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %esi, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 12(%ebp), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl 8(%ebp), %esi +; FALLBACK24-NEXT: movl %ebp, %edi +; FALLBACK24-NEXT: movl %esi, %ebp +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; 
FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %esi, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edi, %ebp +; FALLBACK24-NEXT: movl 20(%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl 16(%edi), %esi +; FALLBACK24-NEXT: movl %esi, %edx +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %esi, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %ebp, %edx +; FALLBACK24-NEXT: movl 28(%ebp), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl 24(%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %esi, %ebp +; FALLBACK24-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 36(%edx), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl 32(%edx), %esi +; FALLBACK24-NEXT: movl %edx, %ebp +; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %esi, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 44(%ebp), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl 40(%ebp), %esi +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %esi, %edx +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %esi, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 52(%ebp), %esi +; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: 
negl %edx +; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: shrl %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: movl 60(%edi), %edx +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl 56(%edi), %ebx +; FALLBACK24-NEXT: movl %ebx, %edi +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %edx, %edi +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %edx, (%eax) +; FALLBACK24-NEXT: movl %esi, 56(%eax) +; FALLBACK24-NEXT: movl %edi, 60(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 48(%eax) +; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 40(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 44(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 32(%eax) +; FALLBACK24-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 36(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 24(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 28(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 4(%eax) +; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: vzeroupper +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: shl_64bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $188, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK25-NEXT: movl (%eax), %ecx +; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %ecx, %ebp +; FALLBACK25-NEXT: andl $60, %ebp +; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: subl %ebp, %eax +; FALLBACK25-NEXT: movl 8(%eax), %esi +; FALLBACK25-NEXT: movl 
12(%eax), %edx +; FALLBACK25-NEXT: shll $3, %ecx +; FALLBACK25-NEXT: andl $24, %ecx +; FALLBACK25-NEXT: movl %edx, %edi +; FALLBACK25-NEXT: shldl %cl, %esi, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 4(%eax), %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 16(%eax), %edi +; FALLBACK25-NEXT: movl 20(%eax), %esi +; FALLBACK25-NEXT: movl %esi, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %edx, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 24(%eax), %edi +; FALLBACK25-NEXT: movl 28(%eax), %edx +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %esi, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 32(%eax), %edi +; FALLBACK25-NEXT: movl 36(%eax), %esi +; FALLBACK25-NEXT: movl %esi, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %edx, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 40(%eax), %edx +; FALLBACK25-NEXT: movl 44(%eax), %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %edx, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shldl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 56(%eax), %edx +; FALLBACK25-NEXT: movl 60(%eax), %edi +; FALLBACK25-NEXT: shldl %cl, %edx, %edi +; FALLBACK25-NEXT: movl (%eax), 
%ebx +; FALLBACK25-NEXT: movl 52(%eax), %esi +; FALLBACK25-NEXT: shldl %cl, %esi, %edx +; FALLBACK25-NEXT: negl %ebp +; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %edx, 56(%ebp) +; FALLBACK25-NEXT: movl %edi, 60(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: shldl %cl, %ebx, %edx +; FALLBACK25-NEXT: shll %cl, %ebx +; FALLBACK25-NEXT: shldl %cl, %eax, %esi +; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK25-NEXT: shldl %cl, %edi, %eax +; FALLBACK25-NEXT: movl %eax, 48(%ebp) +; FALLBACK25-NEXT: movl %esi, 52(%ebp) +; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 40(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 44(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 32(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 36(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 24(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl %edx, 4(%ebp) +; FALLBACK25-NEXT: addl $188, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: 
popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: vzeroupper +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: shl_64bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $204, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK26-NEXT: movl (%eax), %eax +; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: leal (,%eax,8), %edx +; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: andl $60, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi +; FALLBACK26-NEXT: subl %eax, %edi +; FALLBACK26-NEXT: movl (%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 4(%edi), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 8(%edi), %esi +; FALLBACK26-NEXT: movl %esi, %ecx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: movl 12(%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi +; FALLBACK26-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 16(%edi), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl 20(%edi), %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK26-NEXT: orl %eax, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: movl 28(%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 32(%edi), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl 36(%edi), %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrl %ecx +; 
FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK26-NEXT: orl %eax, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 40(%edi), %ecx +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK26-NEXT: movl 44(%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%edi), %esi +; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 52(%edi), %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: negl %eax +; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx +; FALLBACK26-NEXT: movl 56(%edi), %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %edx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %edx, %esi +; FALLBACK26-NEXT: shrl %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK26-NEXT: orl %eax, %ecx +; FALLBACK26-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK26-NEXT: movl %edx, (%eax) +; FALLBACK26-NEXT: movl %esi, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 60(%eax) +; FALLBACK26-NEXT: movl %ebp, 48(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 44(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 32(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 36(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 24(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 28(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 16(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 20(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 8(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 12(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: addl $204, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: vzeroupper +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: shl_64bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; 
FALLBACK27-NEXT: subl $204, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 +; FALLBACK27-NEXT: movl (%eax), %ebx +; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: leal (,%ebx,8), %ecx +; FALLBACK27-NEXT: andl $24, %ecx +; FALLBACK27-NEXT: andl $60, %ebx +; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: subl %ebx, %eax +; FALLBACK27-NEXT: movl 4(%eax), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 8(%eax), %edi +; FALLBACK27-NEXT: movl 12(%eax), %edx +; FALLBACK27-NEXT: movl %edx, %ebp +; FALLBACK27-NEXT: shldl %cl, %edi, %ebp +; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %esi, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 16(%eax), %edi +; FALLBACK27-NEXT: movl 20(%eax), %esi +; FALLBACK27-NEXT: movl %esi, %ebp +; FALLBACK27-NEXT: shldl %cl, %edi, %ebp +; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 24(%eax), %edi +; FALLBACK27-NEXT: movl 28(%eax), %edx +; FALLBACK27-NEXT: movl %edx, %ebp +; FALLBACK27-NEXT: shldl %cl, %edi, %ebp +; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %esi, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 32(%eax), %edi +; FALLBACK27-NEXT: movl 36(%eax), %esi +; FALLBACK27-NEXT: movl %esi, %ebp +; FALLBACK27-NEXT: shldl %cl, %edi, %ebp +; 
FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 40(%eax), %ebp +; FALLBACK27-NEXT: movl 44(%eax), %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %ebp, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shldl %cl, %esi, %ebp +; FALLBACK27-NEXT: movl 56(%eax), %edx +; FALLBACK27-NEXT: movl 60(%eax), %edi +; FALLBACK27-NEXT: shldl %cl, %edx, %edi +; FALLBACK27-NEXT: movl (%eax), %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 52(%eax), %esi +; FALLBACK27-NEXT: shldl %cl, %esi, %edx +; FALLBACK27-NEXT: negl %ebx +; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: movl %edx, 56(%eax) +; FALLBACK27-NEXT: movl %edi, 60(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK27-NEXT: shldl %cl, %edx, %edi +; FALLBACK27-NEXT: shldl %cl, %ebx, %esi +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shldl %cl, %edx, %ebx +; FALLBACK27-NEXT: movl %ebx, 48(%eax) +; FALLBACK27-NEXT: movl %esi, 52(%eax) +; FALLBACK27-NEXT: movl %ebp, 40(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 44(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 32(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 36(%eax) +; FALLBACK27-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 24(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 28(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 16(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 20(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 8(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 12(%eax) +; FALLBACK27-NEXT: movl %edi, 4(%eax) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, (%eax) +; FALLBACK27-NEXT: addl $204, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: vzeroupper +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: shl_64bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK28-NEXT: movl (%eax), %eax +; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: andl $60, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx +; FALLBACK28-NEXT: subl %edx, %ecx +; FALLBACK28-NEXT: movl (%ecx), %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 4(%ecx), %edx +; FALLBACK28-NEXT: movl %ecx, %ebp +; FALLBACK28-NEXT: shll $3, %eax 
+; FALLBACK28-NEXT: andl $24, %eax +; FALLBACK28-NEXT: movl %edx, %esi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %al, %ch +; FALLBACK28-NEXT: notb %ch +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %esi, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 12(%ebp), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl 8(%ebp), %esi +; FALLBACK28-NEXT: movl %ebp, %edi +; FALLBACK28-NEXT: movl %esi, %ebp +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %esi, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edi, %ebp +; FALLBACK28-NEXT: movl 20(%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl 16(%edi), %esi +; FALLBACK28-NEXT: movl %esi, %edx +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %esi, %edi +; FALLBACK28-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %ebp, %edx +; FALLBACK28-NEXT: movl 28(%ebp), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl 24(%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %esi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%edx), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl 32(%edx), %esi +; FALLBACK28-NEXT: movl %edx, %ebp +; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %esi, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 44(%ebp), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl 40(%ebp), %esi +; FALLBACK28-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %esi, %edx +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %esi, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 52(%ebp), %esi +; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: negl %edx +; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: shrl %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: movl 60(%edi), %edx +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl 56(%edi), %ebx +; FALLBACK28-NEXT: movl %ebx, %edi +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %edx, %edi +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shrl %cl, %esi +; 
FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %edx, (%eax) +; FALLBACK28-NEXT: movl %esi, 56(%eax) +; FALLBACK28-NEXT: movl %edi, 60(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 48(%eax) +; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 40(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 44(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 32(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 36(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 24(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 28(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 4(%eax) +; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: vzeroupper +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: shl_64bytes: +; FALLBACK29: # %bb.0: +; 
FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $188, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK29-NEXT: movl (%eax), %ecx +; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %ecx, %ebp +; FALLBACK29-NEXT: andl $60, %ebp +; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: subl %ebp, %eax +; FALLBACK29-NEXT: movl 8(%eax), %esi +; FALLBACK29-NEXT: movl 12(%eax), %edx +; FALLBACK29-NEXT: shll $3, %ecx +; FALLBACK29-NEXT: andl $24, %ecx +; FALLBACK29-NEXT: movl %edx, %edi +; FALLBACK29-NEXT: shldl %cl, %esi, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 4(%eax), %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 16(%eax), %edi +; FALLBACK29-NEXT: movl 20(%eax), %esi +; FALLBACK29-NEXT: movl %esi, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %edx, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 24(%eax), %edi +; FALLBACK29-NEXT: movl 28(%eax), %edx +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %esi, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 32(%eax), %edi +; FALLBACK29-NEXT: movl 36(%eax), %esi +; FALLBACK29-NEXT: movl %esi, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, 
%ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %edx, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 40(%eax), %edx +; FALLBACK29-NEXT: movl 44(%eax), %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %edx, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shldl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 56(%eax), %edx +; FALLBACK29-NEXT: movl 60(%eax), %edi +; FALLBACK29-NEXT: shldl %cl, %edx, %edi +; FALLBACK29-NEXT: movl (%eax), %ebx +; FALLBACK29-NEXT: movl 52(%eax), %esi +; FALLBACK29-NEXT: shldl %cl, %esi, %edx +; FALLBACK29-NEXT: negl %ebp +; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %edx, 56(%ebp) +; FALLBACK29-NEXT: movl %edi, 60(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: shldl %cl, %ebx, %edx +; FALLBACK29-NEXT: shll %cl, %ebx +; FALLBACK29-NEXT: shldl %cl, %eax, %esi +; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK29-NEXT: shldl %cl, %edi, %eax +; FALLBACK29-NEXT: movl %eax, 48(%ebp) +; FALLBACK29-NEXT: movl %esi, 52(%ebp) +; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 40(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 44(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 32(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 36(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 24(%ebp) +; FALLBACK29-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl %edx, 4(%ebp) +; FALLBACK29-NEXT: addl $188, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: vzeroupper +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: shl_64bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $204, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK30-NEXT: movl (%eax), %eax +; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: leal (,%eax,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: andl $60, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi +; FALLBACK30-NEXT: subl %eax, %edi +; FALLBACK30-NEXT: movl (%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 4(%edi), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi 
+; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 8(%edi), %esi +; FALLBACK30-NEXT: movl %esi, %ecx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 12(%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 16(%edi), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl 20(%edi), %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: orl %eax, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 28(%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi +; 
FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 32(%edi), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl 36(%edi), %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: orl %eax, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 40(%edi), %ecx +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK30-NEXT: movl 44(%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%edi), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 52(%edi), %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %eax, %ebp +; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; 
FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: negl %eax +; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx +; FALLBACK30-NEXT: movl 56(%edi), %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %edx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %edx, %esi +; FALLBACK30-NEXT: shrl %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK30-NEXT: orl %eax, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK30-NEXT: movl %edx, (%eax) +; FALLBACK30-NEXT: movl %esi, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 60(%eax) +; FALLBACK30-NEXT: movl %ebp, 48(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 44(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 32(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 36(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 24(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 28(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 16(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 20(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 8(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK30-NEXT: movl %ecx, 12(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: addl $204, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: vzeroupper +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: shl_64bytes: +; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $204, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 +; FALLBACK31-NEXT: movl (%eax), %ebx +; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: leal (,%ebx,8), %ecx +; FALLBACK31-NEXT: andl $24, %ecx +; FALLBACK31-NEXT: andl $60, %ebx +; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: subl %ebx, %eax +; FALLBACK31-NEXT: movl 4(%eax), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 8(%eax), %edi +; FALLBACK31-NEXT: movl 12(%eax), %edx +; FALLBACK31-NEXT: movl %edx, %ebp +; FALLBACK31-NEXT: shldl %cl, %edi, %ebp +; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %esi, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 16(%eax), %edi +; FALLBACK31-NEXT: movl 20(%eax), %esi +; FALLBACK31-NEXT: movl %esi, %ebp +; FALLBACK31-NEXT: shldl %cl, %edi, %ebp +; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 24(%eax), %edi +; FALLBACK31-NEXT: movl 28(%eax), %edx +; FALLBACK31-NEXT: 
movl %edx, %ebp +; FALLBACK31-NEXT: shldl %cl, %edi, %ebp +; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %esi, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 32(%eax), %edi +; FALLBACK31-NEXT: movl 36(%eax), %esi +; FALLBACK31-NEXT: movl %esi, %ebp +; FALLBACK31-NEXT: shldl %cl, %edi, %ebp +; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 40(%eax), %ebp +; FALLBACK31-NEXT: movl 44(%eax), %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %ebp, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shldl %cl, %esi, %ebp +; FALLBACK31-NEXT: movl 56(%eax), %edx +; FALLBACK31-NEXT: movl 60(%eax), %edi +; FALLBACK31-NEXT: shldl %cl, %edx, %edi +; FALLBACK31-NEXT: movl (%eax), %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 52(%eax), %esi +; FALLBACK31-NEXT: shldl %cl, %esi, %edx +; FALLBACK31-NEXT: negl %ebx +; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: movl %edx, 56(%eax) +; FALLBACK31-NEXT: movl %edi, 60(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK31-NEXT: shldl %cl, %edx, %edi +; FALLBACK31-NEXT: shldl %cl, %ebx, %esi +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shldl %cl, %edx, %ebx +; FALLBACK31-NEXT: movl %ebx, 48(%eax) +; FALLBACK31-NEXT: movl %esi, 
52(%eax) +; FALLBACK31-NEXT: movl %ebp, 40(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 44(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 32(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 36(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 24(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 28(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 16(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 20(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 8(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 12(%eax) +; FALLBACK31-NEXT: movl %edi, 4(%eax) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, (%eax) +; FALLBACK31-NEXT: addl $204, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; FALLBACK31-NEXT: vzeroupper +; FALLBACK31-NEXT: retl + %src = load i512, ptr %src.ptr, align 1 + %byteOff = load i512, ptr %byteOff.ptr, align 1 + %bitOff = shl i512 %byteOff, 3 + %res = shl i512 %src, %bitOff + store i512 %res, ptr %dst, align 1 + ret void +} + +define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: shl_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), %rax @@ -2012,6 +19811,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: 
movq 56(%rdi), %rdi ; X64-SSE2-NEXT: movl (%rsi), %esi +; X64-SSE2-NEXT: xorps %xmm0, %xmm0 +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -2020,15 +19824,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $63, %esi +; X64-SSE2-NEXT: shll $3, %esi +; X64-SSE2-NEXT: andl $56, %esi ; X64-SSE2-NEXT: negl %esi ; X64-SSE2-NEXT: movslq %esi, %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx @@ -2050,23 +19847,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: shl_64bytes: +; X64-SSE42-LABEL: shl_64bytes_qwordOff: ; X64-SSE42: # %bb.0: +; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2 ; X64-SSE42-NEXT: movups 48(%rdi), %xmm3 ; X64-SSE42-NEXT: movl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm4, %xmm4 -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm3, 
-{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andl $63, %eax +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: shll $3, %eax +; X64-SSE42-NEXT: andl $56, %eax ; X64-SSE42-NEXT: negl %eax ; X64-SSE42-NEXT: cltq ; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0 @@ -2077,10 +19876,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) +; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; -; X64-AVX1-LABEL: shl_64bytes: +; X64-AVX1-LABEL: shl_64bytes_qwordOff: ; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rax ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; X64-AVX1-NEXT: movl (%rsi), %eax @@ -2089,7 +19890,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX1-NEXT: andl $63, %eax +; X64-AVX1-NEXT: shll $3, %eax +; X64-AVX1-NEXT: andl $56, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq ; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 @@ -2100,17 +19902,20 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: popq %rax ; 
X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; -; X64-AVX512-LABEL: shl_64bytes: +; X64-AVX512-LABEL: shl_64bytes_qwordOff: ; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: pushq %rax ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0 ; X64-AVX512-NEXT: movl (%rsi), %eax ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; X64-AVX512-NEXT: andl $63, %eax +; X64-AVX512-NEXT: shll $3, %eax +; X64-AVX512-NEXT: andl $56, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq ; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 @@ -2121,117 +19926,108 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: popq %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; -; X86-SSE2-LABEL: shl_64bytes: +; X86-SSE2-LABEL: shl_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $168, %esp -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%eax), 
%ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 40(%eax), %ebp -; X86-SSE2-NEXT: movl 44(%eax), %ebx -; X86-SSE2-NEXT: movl 48(%eax), %edi -; X86-SSE2-NEXT: movl 52(%eax), %esi -; X86-SSE2-NEXT: movl 56(%eax), %edx -; X86-SSE2-NEXT: movl 60(%eax), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl (%eax), %eax -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: subl $188, %esp +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 4(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 12(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 16(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 24(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 32(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 36(%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-SSE2-NEXT: movl 48(%ecx), %edi +; X86-SSE2-NEXT: movl 52(%ecx), %esi +; X86-SSE2-NEXT: movl 56(%ecx), %edx +; X86-SSE2-NEXT: movl 60(%ecx), %eax 
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movl (%ecx), %ecx +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $63, %eax -; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: subl %eax, %ecx -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%ecx), %edx +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: shll $3, %ecx +; X86-SSE2-NEXT: andl $56, %ecx +; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: subl %ecx, %eax +; X86-SSE2-NEXT: movl (%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-SSE2-NEXT: movl 4(%ecx), %edx +; X86-SSE2-NEXT: movl 4(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%ecx), %edx +; X86-SSE2-NEXT: movl 12(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ecx), %edx +; X86-SSE2-NEXT: movl 8(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%ecx), %edx +; X86-SSE2-NEXT: movl 20(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%ecx), %edx +; X86-SSE2-NEXT: movl 16(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%ecx), %edx +; X86-SSE2-NEXT: movl 28(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%ecx), %edx +; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%ecx), %edx +; X86-SSE2-NEXT: movl 36(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%ecx), %ebp -; X86-SSE2-NEXT: movl 40(%ecx), %ebx -; X86-SSE2-NEXT: movl 52(%ecx), %edi -; X86-SSE2-NEXT: movl 60(%ecx), %esi -; X86-SSE2-NEXT: movl 56(%ecx), %edx -; X86-SSE2-NEXT: negl %eax -; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 32(%eax), %edx +; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 44(%eax), %ebp +; X86-SSE2-NEXT: movl 40(%eax), %ebx +; X86-SSE2-NEXT: movl 52(%eax), %edi +; X86-SSE2-NEXT: movl 60(%eax), %esi +; X86-SSE2-NEXT: movl 56(%eax), %edx +; X86-SSE2-NEXT: negl %ecx +; X86-SSE2-NEXT: movl 160(%esp,%ecx), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %edx, 56(%eax) ; X86-SSE2-NEXT: movl %esi, 60(%eax) @@ 
-2239,7 +20035,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) @@ -2259,16 +20055,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl $168, %esp +; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: shl_64bytes: +; X86-SSE42-LABEL: shl_64bytes_qwordOff: ; X86-SSE42: # %bb.0: -; X86-SSE42-NEXT: subl $128, %esp +; X86-SSE42-NEXT: subl $140, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2278,15 +20074,16 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movups 48(%edx), %xmm3 ; X86-SSE42-NEXT: movl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm4, %xmm4 -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm4, (%esp) -; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andl $63, %ecx +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, 
{{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm4, (%esp) +; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: shll $3, %ecx +; X86-SSE42-NEXT: andl $56, %ecx ; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: subl %ecx, %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 @@ -2298,12 +20095,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) -; X86-SSE42-NEXT: addl $128, %esp +; X86-SSE42-NEXT: addl $140, %esp ; X86-SSE42-NEXT: retl ; -; X86-AVX1-LABEL: shl_64bytes: +; X86-AVX1-LABEL: shl_64bytes_qwordOff: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: subl $128, %esp +; X86-AVX1-NEXT: subl $140, %esp ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2315,7 +20112,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX1-NEXT: vmovups %ymm2, (%esp) ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; X86-AVX1-NEXT: andl $63, %ecx +; X86-AVX1-NEXT: shll $3, %ecx +; X86-AVX1-NEXT: andl $56, %ecx ; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: subl %ecx, %edx ; X86-AVX1-NEXT: vmovups (%edx), %xmm0 @@ -2327,13 +20125,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX1-NEXT: vmovups %xmm0, (%eax) -; X86-AVX1-NEXT: addl $128, %esp +; X86-AVX1-NEXT: addl $140, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; -; X86-AVX512-LABEL: shl_64bytes: +; X86-AVX512-LABEL: shl_64bytes_qwordOff: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: subl $128, %esp +; 
X86-AVX512-NEXT: subl $140, %esp ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2342,7 +20140,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vmovups %zmm1, (%esp) ; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; X86-AVX512-NEXT: andl $63, %ecx +; X86-AVX512-NEXT: shll $3, %ecx +; X86-AVX512-NEXT: andl $56, %ecx ; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX512-NEXT: subl %ecx, %edx ; X86-AVX512-NEXT: vmovups (%edx), %xmm0 @@ -2354,18 +20153,4121 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX512-NEXT: vmovups %xmm0, (%eax) -; X86-AVX512-NEXT: addl $128, %esp +; X86-AVX512-NEXT: addl $140, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 - %byteOff = load i512, ptr %byteOff.ptr, align 1 - %bitOff = shl i512 %byteOff, 3 + %qwordOff = load i512, ptr %qwordOff.ptr, align 1 + %bitOff = shl i512 %qwordOff, 6 %res = shl i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } + define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { -; X64-SSE2-LABEL: ashr_64bytes: +; FALLBACK0-LABEL: ashr_64bytes: +; FALLBACK0: # %bb.0: +; FALLBACK0-NEXT: pushq %r15 +; FALLBACK0-NEXT: pushq %r14 +; FALLBACK0-NEXT: pushq %r13 +; FALLBACK0-NEXT: pushq %r12 +; FALLBACK0-NEXT: pushq %rbx +; FALLBACK0-NEXT: movq (%rdi), %rax +; FALLBACK0-NEXT: movq 8(%rdi), %rcx +; FALLBACK0-NEXT: movq 16(%rdi), %r8 +; FALLBACK0-NEXT: movq 24(%rdi), %r9 +; FALLBACK0-NEXT: movq 32(%rdi), %r10 +; FALLBACK0-NEXT: movq 40(%rdi), %r11 +; FALLBACK0-NEXT: movq 48(%rdi), %rbx +; FALLBACK0-NEXT: movq 56(%rdi), %r14 +; FALLBACK0-NEXT: movl (%rsi), %edi +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; 
FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: sarq $63, %r14 +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK0-NEXT: leal (,%rdi,8), %eax +; FALLBACK0-NEXT: andl $56, %eax +; FALLBACK0-NEXT: andl $56, %edi +; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 +; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 +; FALLBACK0-NEXT: movq %r8, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r11 +; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: notb %sil +; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx +; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r9 +; FALLBACK0-NEXT: orq %r11, %r9 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r10 +; FALLBACK0-NEXT: addq %r8, %r8 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r8 +; FALLBACK0-NEXT: orq %r10, %r8 +; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 +; FALLBACK0-NEXT: movq %r10, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r15 +; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 +; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r11 +; FALLBACK0-NEXT: orq %r15, %r11 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %rbx +; FALLBACK0-NEXT: addq %r10, %r10 +; FALLBACK0-NEXT: movl %esi, %ecx +; 
FALLBACK0-NEXT: shlq %cl, %r10 +; FALLBACK0-NEXT: orq %rbx, %r10 +; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx +; FALLBACK0-NEXT: movq %rbx, %r12 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r12 +; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 +; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r15 +; FALLBACK0-NEXT: orq %r12, %r15 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r14 +; FALLBACK0-NEXT: addq %rbx, %rbx +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %rbx +; FALLBACK0-NEXT: orq %r14, %rbx +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: shrq %cl, %r13 +; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi +; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 +; FALLBACK0-NEXT: movl %esi, %ecx +; FALLBACK0-NEXT: shlq %cl, %r14 +; FALLBACK0-NEXT: orq %r13, %r14 +; FALLBACK0-NEXT: movl %eax, %ecx +; FALLBACK0-NEXT: sarq %cl, %rdi +; FALLBACK0-NEXT: movq %rdi, 56(%rdx) +; FALLBACK0-NEXT: movq %r14, 48(%rdx) +; FALLBACK0-NEXT: movq %rbx, 32(%rdx) +; FALLBACK0-NEXT: movq %r15, 40(%rdx) +; FALLBACK0-NEXT: movq %r10, 16(%rdx) +; FALLBACK0-NEXT: movq %r11, 24(%rdx) +; FALLBACK0-NEXT: movq %r8, (%rdx) +; FALLBACK0-NEXT: movq %r9, 8(%rdx) +; FALLBACK0-NEXT: popq %rbx +; FALLBACK0-NEXT: popq %r12 +; FALLBACK0-NEXT: popq %r13 +; FALLBACK0-NEXT: popq %r14 +; FALLBACK0-NEXT: popq %r15 +; FALLBACK0-NEXT: retq +; +; FALLBACK1-LABEL: ashr_64bytes: +; FALLBACK1: # %bb.0: +; FALLBACK1-NEXT: pushq %r15 +; FALLBACK1-NEXT: pushq %r14 +; FALLBACK1-NEXT: pushq %rbx +; FALLBACK1-NEXT: movq (%rdi), %rcx +; FALLBACK1-NEXT: movq 8(%rdi), %r8 +; FALLBACK1-NEXT: movq 16(%rdi), %r9 +; FALLBACK1-NEXT: movq 24(%rdi), %r10 +; FALLBACK1-NEXT: movq 32(%rdi), %r11 +; FALLBACK1-NEXT: movq 40(%rdi), %rbx +; FALLBACK1-NEXT: movq 48(%rdi), %r14 +; FALLBACK1-NEXT: movq 56(%rdi), %rdi +; FALLBACK1-NEXT: movl (%rsi), %eax +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r14, 
-{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: sarq $63, %rdi +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK1-NEXT: leal (,%rax,8), %ecx +; FALLBACK1-NEXT: andl $56, %ecx +; FALLBACK1-NEXT: andl $56, %eax +; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi +; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi +; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 +; FALLBACK1-NEXT: movq %r9, %r8 +; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 +; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq %r11, %rbx +; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 +; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK1-NEXT: movq %r14, %r15 +; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 +; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 +; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 +; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK1-NEXT: sarq %cl, %rax +; FALLBACK1-NEXT: movq %r11, 48(%rdx) +; FALLBACK1-NEXT: movq %rax, 56(%rdx) +; FALLBACK1-NEXT: movq %r10, 32(%rdx) +; FALLBACK1-NEXT: movq %r15, 40(%rdx) +; FALLBACK1-NEXT: movq %rdi, 16(%rdx) +; FALLBACK1-NEXT: movq %rbx, 24(%rdx) +; FALLBACK1-NEXT: movq %rsi, (%rdx) +; FALLBACK1-NEXT: movq %r8, 8(%rdx) +; FALLBACK1-NEXT: 
popq %rbx +; FALLBACK1-NEXT: popq %r14 +; FALLBACK1-NEXT: popq %r15 +; FALLBACK1-NEXT: retq +; +; FALLBACK2-LABEL: ashr_64bytes: +; FALLBACK2: # %bb.0: +; FALLBACK2-NEXT: pushq %rbp +; FALLBACK2-NEXT: pushq %r15 +; FALLBACK2-NEXT: pushq %r14 +; FALLBACK2-NEXT: pushq %r13 +; FALLBACK2-NEXT: pushq %r12 +; FALLBACK2-NEXT: pushq %rbx +; FALLBACK2-NEXT: pushq %rax +; FALLBACK2-NEXT: movq (%rdi), %rcx +; FALLBACK2-NEXT: movq 8(%rdi), %r8 +; FALLBACK2-NEXT: movq 16(%rdi), %r9 +; FALLBACK2-NEXT: movq 24(%rdi), %r10 +; FALLBACK2-NEXT: movq 32(%rdi), %r11 +; FALLBACK2-NEXT: movq 40(%rdi), %rbx +; FALLBACK2-NEXT: movq 48(%rdi), %r14 +; FALLBACK2-NEXT: movq 56(%rdi), %rdi +; FALLBACK2-NEXT: movl (%rsi), %eax +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: sarq $63, %rdi +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: leal (,%rax,8), %ecx +; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx +; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 +; FALLBACK2-NEXT: movq 
-88(%rsp,%rax), %r14 +; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 +; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp +; FALLBACK2-NEXT: movl %ecx, %r12d +; FALLBACK2-NEXT: notb %r12b +; FALLBACK2-NEXT: addq %r9, %r9 +; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: orq %rbx, %r9 +; FALLBACK2-NEXT: addq %rdi, %rdi +; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi +; FALLBACK2-NEXT: orq %r13, %rdi +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx +; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: addq %r10, %r10 +; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 +; FALLBACK2-NEXT: orq %r8, %r10 +; FALLBACK2-NEXT: addq %rsi, %rsi +; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi +; FALLBACK2-NEXT: orq %r11, %rsi +; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 +; FALLBACK2-NEXT: orq %r15, %r8 +; FALLBACK2-NEXT: addq %r14, %r14 +; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 +; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: addq %rax, %rax +; FALLBACK2-NEXT: shlxq %r12, %rax, %rax +; FALLBACK2-NEXT: orq %r13, %rax +; FALLBACK2-NEXT: movq %rcx, 56(%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r8, 40(%rdx) +; FALLBACK2-NEXT: movq %rsi, 16(%rdx) +; FALLBACK2-NEXT: movq %r10, 24(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: addq $8, %rsp +; FALLBACK2-NEXT: popq %rbx +; FALLBACK2-NEXT: popq %r12 +; FALLBACK2-NEXT: popq %r13 +; FALLBACK2-NEXT: popq %r14 +; FALLBACK2-NEXT: popq %r15 +; FALLBACK2-NEXT: popq %rbp +; FALLBACK2-NEXT: retq +; +; FALLBACK3-LABEL: ashr_64bytes: +; FALLBACK3: # %bb.0: +; FALLBACK3-NEXT: pushq %r15 +; FALLBACK3-NEXT: pushq %r14 +; FALLBACK3-NEXT: pushq %rbx +; FALLBACK3-NEXT: movq (%rdi), %rcx +; FALLBACK3-NEXT: movq 8(%rdi), %r8 +; FALLBACK3-NEXT: movq 16(%rdi), %r9 +; FALLBACK3-NEXT: movq 24(%rdi), %r10 +; FALLBACK3-NEXT: movq 32(%rdi), %r11 +; 
FALLBACK3-NEXT: movq 40(%rdi), %rbx +; FALLBACK3-NEXT: movq 48(%rdi), %r14 +; FALLBACK3-NEXT: movq 56(%rdi), %rdi +; FALLBACK3-NEXT: movl (%rsi), %eax +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: sarq $63, %rdi +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK3-NEXT: leal (,%rax,8), %ecx +; FALLBACK3-NEXT: andl $56, %ecx +; FALLBACK3-NEXT: andl $56, %eax +; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi +; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi +; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 +; FALLBACK3-NEXT: movq %r9, %r8 +; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 +; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 +; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq %r11, %rbx +; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx +; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 +; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 +; FALLBACK3-NEXT: movq %r14, %r15 +; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 +; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 +; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 +; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax +; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK3-NEXT: movq %r11, 48(%rdx) +; FALLBACK3-NEXT: movq %r10, 32(%rdx) +; 
FALLBACK3-NEXT: movq %r15, 40(%rdx) +; FALLBACK3-NEXT: movq %rdi, 16(%rdx) +; FALLBACK3-NEXT: movq %rbx, 24(%rdx) +; FALLBACK3-NEXT: movq %rsi, (%rdx) +; FALLBACK3-NEXT: movq %r8, 8(%rdx) +; FALLBACK3-NEXT: movq %rax, 56(%rdx) +; FALLBACK3-NEXT: popq %rbx +; FALLBACK3-NEXT: popq %r14 +; FALLBACK3-NEXT: popq %r15 +; FALLBACK3-NEXT: retq +; +; FALLBACK4-LABEL: ashr_64bytes: +; FALLBACK4: # %bb.0: +; FALLBACK4-NEXT: pushq %rbp +; FALLBACK4-NEXT: pushq %r15 +; FALLBACK4-NEXT: pushq %r14 +; FALLBACK4-NEXT: pushq %r13 +; FALLBACK4-NEXT: pushq %r12 +; FALLBACK4-NEXT: pushq %rbx +; FALLBACK4-NEXT: pushq %rax +; FALLBACK4-NEXT: movups (%rdi), %xmm0 +; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK4-NEXT: movq 48(%rdi), %rax +; FALLBACK4-NEXT: movq 56(%rdi), %rcx +; FALLBACK4-NEXT: movl (%rsi), %edi +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: sarq $63, %rcx +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK4-NEXT: leal (,%rdi,8), %eax +; FALLBACK4-NEXT: andl $56, %eax +; FALLBACK4-NEXT: andl $56, %edi +; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 +; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r10 +; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: notb %sil +; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r8 +; FALLBACK4-NEXT: orq %r10, %r8 +; 
FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10 +; FALLBACK4-NEXT: movq %r10, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %rbx +; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12 +; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r11 +; FALLBACK4-NEXT: orq %rbx, %r11 +; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx +; FALLBACK4-NEXT: movq %rbx, %r14 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r14 +; FALLBACK4-NEXT: addq %r10, %r10 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r10 +; FALLBACK4-NEXT: orq %r14, %r10 +; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14 +; FALLBACK4-NEXT: movq %r14, %r13 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r13 +; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp +; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r15 +; FALLBACK4-NEXT: orq %r13, %r15 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r12 +; FALLBACK4-NEXT: addq %r14, %r14 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r14 +; FALLBACK4-NEXT: orq %r12, %r14 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %rbp +; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi +; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %r12 +; FALLBACK4-NEXT: orq %rbp, %r12 +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: shrq %cl, %r9 +; FALLBACK4-NEXT: addq %rbx, %rbx +; FALLBACK4-NEXT: movl %esi, %ecx +; FALLBACK4-NEXT: shlq %cl, %rbx +; FALLBACK4-NEXT: orq %r9, %rbx +; FALLBACK4-NEXT: movl %eax, %ecx +; FALLBACK4-NEXT: sarq %cl, %rdi +; FALLBACK4-NEXT: movq %rdi, 56(%rdx) +; FALLBACK4-NEXT: movq %rbx, 8(%rdx) +; FALLBACK4-NEXT: movq %r12, 48(%rdx) +; FALLBACK4-NEXT: movq %r14, 32(%rdx) +; FALLBACK4-NEXT: movq %r15, 40(%rdx) +; FALLBACK4-NEXT: movq %r10, 16(%rdx) +; FALLBACK4-NEXT: movq %r11, 24(%rdx) +; FALLBACK4-NEXT: movq 
%r8, (%rdx) +; FALLBACK4-NEXT: addq $8, %rsp +; FALLBACK4-NEXT: popq %rbx +; FALLBACK4-NEXT: popq %r12 +; FALLBACK4-NEXT: popq %r13 +; FALLBACK4-NEXT: popq %r14 +; FALLBACK4-NEXT: popq %r15 +; FALLBACK4-NEXT: popq %rbp +; FALLBACK4-NEXT: retq +; +; FALLBACK5-LABEL: ashr_64bytes: +; FALLBACK5: # %bb.0: +; FALLBACK5-NEXT: pushq %r15 +; FALLBACK5-NEXT: pushq %r14 +; FALLBACK5-NEXT: pushq %rbx +; FALLBACK5-NEXT: movups (%rdi), %xmm0 +; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK5-NEXT: movq 48(%rdi), %rcx +; FALLBACK5-NEXT: movq 56(%rdi), %rdi +; FALLBACK5-NEXT: movl (%rsi), %eax +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: sarq $63, %rdi +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK5-NEXT: leal (,%rax,8), %ecx +; FALLBACK5-NEXT: andl $56, %ecx +; FALLBACK5-NEXT: andl $56, %eax +; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq %r9, %rsi +; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK5-NEXT: movq %r10, %r8 +; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK5-NEXT: movq %r11, %rbx +; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK5-NEXT: 
movq -128(%rsp,%rax), %r14 +; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK5-NEXT: movq %rax, %r15 +; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK5-NEXT: sarq %cl, %r11 +; FALLBACK5-NEXT: movq %r15, 8(%rdx) +; FALLBACK5-NEXT: movq %r9, 48(%rdx) +; FALLBACK5-NEXT: movq %r11, 56(%rdx) +; FALLBACK5-NEXT: movq %rdi, 32(%rdx) +; FALLBACK5-NEXT: movq %rbx, 40(%rdx) +; FALLBACK5-NEXT: movq %r8, 16(%rdx) +; FALLBACK5-NEXT: movq %rsi, 24(%rdx) +; FALLBACK5-NEXT: movq %r14, (%rdx) +; FALLBACK5-NEXT: popq %rbx +; FALLBACK5-NEXT: popq %r14 +; FALLBACK5-NEXT: popq %r15 +; FALLBACK5-NEXT: retq +; +; FALLBACK6-LABEL: ashr_64bytes: +; FALLBACK6: # %bb.0: +; FALLBACK6-NEXT: pushq %rbp +; FALLBACK6-NEXT: pushq %r15 +; FALLBACK6-NEXT: pushq %r14 +; FALLBACK6-NEXT: pushq %r13 +; FALLBACK6-NEXT: pushq %r12 +; FALLBACK6-NEXT: pushq %rbx +; FALLBACK6-NEXT: pushq %rax +; FALLBACK6-NEXT: movups (%rdi), %xmm0 +; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK6-NEXT: movq 48(%rdi), %rcx +; FALLBACK6-NEXT: movq 56(%rdi), %rdi +; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: sarq $63, %rdi +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: leal (,%rax,8), %esi +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: andl $56, %eax +; 
FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK6-NEXT: movl %esi, %ebx +; FALLBACK6-NEXT: notb %bl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK6-NEXT: orq %r11, %r8 +; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi +; FALLBACK6-NEXT: addq %rdi, %rdi +; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r9, %rdi +; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK6-NEXT: orq %r14, %r9 +; FALLBACK6-NEXT: addq %r10, %r10 +; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK6-NEXT: orq %r15, %r10 +; FALLBACK6-NEXT: addq %rax, %rax +; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK6-NEXT: orq %r13, %rax +; FALLBACK6-NEXT: addq %rcx, %rcx +; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK6-NEXT: orq %rbp, %rcx +; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: movq %rcx, 8(%rdx) +; FALLBACK6-NEXT: movq %rax, 48(%rdx) +; FALLBACK6-NEXT: movq %r10, 32(%rdx) +; FALLBACK6-NEXT: movq %r9, 40(%rdx) +; FALLBACK6-NEXT: movq %rdi, 16(%rdx) +; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %r8, (%rdx) +; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: popq %rbx +; FALLBACK6-NEXT: popq %r12 +; FALLBACK6-NEXT: popq %r13 +; FALLBACK6-NEXT: popq %r14 +; FALLBACK6-NEXT: popq %r15 
+; FALLBACK6-NEXT: popq %rbp +; FALLBACK6-NEXT: retq +; +; FALLBACK7-LABEL: ashr_64bytes: +; FALLBACK7: # %bb.0: +; FALLBACK7-NEXT: pushq %r15 +; FALLBACK7-NEXT: pushq %r14 +; FALLBACK7-NEXT: pushq %rbx +; FALLBACK7-NEXT: movups (%rdi), %xmm0 +; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 +; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 +; FALLBACK7-NEXT: movq 48(%rdi), %rcx +; FALLBACK7-NEXT: movq 56(%rdi), %rdi +; FALLBACK7-NEXT: movl (%rsi), %eax +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: sarq $63, %rdi +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK7-NEXT: leal (,%rax,8), %ecx +; FALLBACK7-NEXT: andl $56, %ecx +; FALLBACK7-NEXT: andl $56, %eax +; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq %r9, %rsi +; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK7-NEXT: movq %r10, %r8 +; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK7-NEXT: movq %r11, %rbx +; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK7-NEXT: movq %rax, %r15 +; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK7-NEXT: sarxq %rcx, %r11, 
%r10 +; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: movq %r15, 8(%rdx) +; FALLBACK7-NEXT: movq %r9, 48(%rdx) +; FALLBACK7-NEXT: movq %rdi, 32(%rdx) +; FALLBACK7-NEXT: movq %rbx, 40(%rdx) +; FALLBACK7-NEXT: movq %r8, 16(%rdx) +; FALLBACK7-NEXT: movq %rsi, 24(%rdx) +; FALLBACK7-NEXT: movq %r14, (%rdx) +; FALLBACK7-NEXT: movq %r10, 56(%rdx) +; FALLBACK7-NEXT: popq %rbx +; FALLBACK7-NEXT: popq %r14 +; FALLBACK7-NEXT: popq %r15 +; FALLBACK7-NEXT: retq +; +; FALLBACK8-LABEL: ashr_64bytes: +; FALLBACK8: # %bb.0: +; FALLBACK8-NEXT: pushq %rbp +; FALLBACK8-NEXT: pushq %r15 +; FALLBACK8-NEXT: pushq %r14 +; FALLBACK8-NEXT: pushq %r13 +; FALLBACK8-NEXT: pushq %r12 +; FALLBACK8-NEXT: pushq %rbx +; FALLBACK8-NEXT: pushq %rax +; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK8-NEXT: movq 48(%rdi), %rax +; FALLBACK8-NEXT: movq 56(%rdi), %rcx +; FALLBACK8-NEXT: movl (%rsi), %edi +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: sarq $63, %rcx +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK8-NEXT: leal (,%rdi,8), %eax +; FALLBACK8-NEXT: andl $56, %eax +; FALLBACK8-NEXT: andl $56, %edi +; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10 +; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r10 +; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: notb %sil +; FALLBACK8-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK8-NEXT: 
movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r8 +; FALLBACK8-NEXT: orq %r10, %r8 +; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10 +; FALLBACK8-NEXT: movq %r10, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %rbx +; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12 +; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r11 +; FALLBACK8-NEXT: orq %rbx, %r11 +; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx +; FALLBACK8-NEXT: movq %rbx, %r14 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r14 +; FALLBACK8-NEXT: addq %r10, %r10 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r10 +; FALLBACK8-NEXT: orq %r14, %r10 +; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14 +; FALLBACK8-NEXT: movq %r14, %r13 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r13 +; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp +; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r15 +; FALLBACK8-NEXT: orq %r13, %r15 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r12 +; FALLBACK8-NEXT: addq %r14, %r14 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r14 +; FALLBACK8-NEXT: orq %r12, %r14 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %rbp +; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi +; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %r12 +; FALLBACK8-NEXT: orq %rbp, %r12 +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: shrq %cl, %r9 +; FALLBACK8-NEXT: addq %rbx, %rbx +; FALLBACK8-NEXT: movl %esi, %ecx +; FALLBACK8-NEXT: shlq %cl, %rbx +; FALLBACK8-NEXT: orq %r9, %rbx +; FALLBACK8-NEXT: movl %eax, %ecx +; FALLBACK8-NEXT: sarq %cl, %rdi +; FALLBACK8-NEXT: movq %rdi, 56(%rdx) +; FALLBACK8-NEXT: movq %rbx, 8(%rdx) +; FALLBACK8-NEXT: movq %r12, 48(%rdx) +; FALLBACK8-NEXT: movq %r14, 32(%rdx) +; FALLBACK8-NEXT: movq %r15, 40(%rdx) +; 
FALLBACK8-NEXT: movq %r10, 16(%rdx) +; FALLBACK8-NEXT: movq %r11, 24(%rdx) +; FALLBACK8-NEXT: movq %r8, (%rdx) +; FALLBACK8-NEXT: addq $8, %rsp +; FALLBACK8-NEXT: popq %rbx +; FALLBACK8-NEXT: popq %r12 +; FALLBACK8-NEXT: popq %r13 +; FALLBACK8-NEXT: popq %r14 +; FALLBACK8-NEXT: popq %r15 +; FALLBACK8-NEXT: popq %rbp +; FALLBACK8-NEXT: vzeroupper +; FALLBACK8-NEXT: retq +; +; FALLBACK9-LABEL: ashr_64bytes: +; FALLBACK9: # %bb.0: +; FALLBACK9-NEXT: pushq %r15 +; FALLBACK9-NEXT: pushq %r14 +; FALLBACK9-NEXT: pushq %rbx +; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK9-NEXT: movq 48(%rdi), %rcx +; FALLBACK9-NEXT: movq 56(%rdi), %rdi +; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: sarq $63, %rdi +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: andl $56, %ecx +; FALLBACK9-NEXT: andl $56, %eax +; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq %r9, %rsi +; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: movq %r10, %r8 +; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq %r11, %rbx +; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: 
shrdq %cl, %r11, %r9 +; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK9-NEXT: sarq %cl, %r11 +; FALLBACK9-NEXT: movq %r15, 8(%rdx) +; FALLBACK9-NEXT: movq %r9, 48(%rdx) +; FALLBACK9-NEXT: movq %r11, 56(%rdx) +; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rbx, 40(%rdx) +; FALLBACK9-NEXT: movq %r8, 16(%rdx) +; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %r14, (%rdx) +; FALLBACK9-NEXT: popq %rbx +; FALLBACK9-NEXT: popq %r14 +; FALLBACK9-NEXT: popq %r15 +; FALLBACK9-NEXT: vzeroupper +; FALLBACK9-NEXT: retq +; +; FALLBACK10-LABEL: ashr_64bytes: +; FALLBACK10: # %bb.0: +; FALLBACK10-NEXT: pushq %rbp +; FALLBACK10-NEXT: pushq %r15 +; FALLBACK10-NEXT: pushq %r14 +; FALLBACK10-NEXT: pushq %r13 +; FALLBACK10-NEXT: pushq %r12 +; FALLBACK10-NEXT: pushq %rbx +; FALLBACK10-NEXT: pushq %rax +; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK10-NEXT: movq 48(%rdi), %rcx +; FALLBACK10-NEXT: movq 56(%rdi), %rdi +; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: sarq $63, %rdi +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: leal (,%rax,8), %esi +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: andl $56, %eax +; 
FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK10-NEXT: movl %esi, %ebx +; FALLBACK10-NEXT: notb %bl +; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK10-NEXT: orq %r11, %r8 +; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi +; FALLBACK10-NEXT: addq %rdi, %rdi +; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK10-NEXT: orq %r9, %rdi +; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK10-NEXT: orq %r14, %r9 +; FALLBACK10-NEXT: addq %r10, %r10 +; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK10-NEXT: orq %r15, %r10 +; FALLBACK10-NEXT: addq %rax, %rax +; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK10-NEXT: orq %r13, %rax +; FALLBACK10-NEXT: addq %rcx, %rcx +; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK10-NEXT: orq %rbp, %rcx +; FALLBACK10-NEXT: movq %rsi, 56(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %rax, 48(%rdx) +; FALLBACK10-NEXT: movq %r10, 32(%rdx) +; FALLBACK10-NEXT: movq %r9, 40(%rdx) +; FALLBACK10-NEXT: movq %rdi, 16(%rdx) +; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %r8, (%rdx) +; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: popq %rbx +; FALLBACK10-NEXT: popq %r12 +; FALLBACK10-NEXT: popq %r13 +; 
FALLBACK10-NEXT: popq %r14 +; FALLBACK10-NEXT: popq %r15 +; FALLBACK10-NEXT: popq %rbp +; FALLBACK10-NEXT: vzeroupper +; FALLBACK10-NEXT: retq +; +; FALLBACK11-LABEL: ashr_64bytes: +; FALLBACK11: # %bb.0: +; FALLBACK11-NEXT: pushq %r15 +; FALLBACK11-NEXT: pushq %r14 +; FALLBACK11-NEXT: pushq %rbx +; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK11-NEXT: movq 48(%rdi), %rcx +; FALLBACK11-NEXT: movq 56(%rdi), %rdi +; FALLBACK11-NEXT: movl (%rsi), %eax +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: sarq $63, %rdi +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK11-NEXT: leal (,%rax,8), %ecx +; FALLBACK11-NEXT: andl $56, %ecx +; FALLBACK11-NEXT: andl $56, %eax +; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq %r9, %rsi +; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK11-NEXT: movq %r10, %r8 +; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK11-NEXT: movq %r11, %rbx +; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK11-NEXT: movq %rax, %r15 +; FALLBACK11-NEXT: shrdq %cl, %r10, 
%r15 +; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10 +; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: movq %r15, 8(%rdx) +; FALLBACK11-NEXT: movq %r9, 48(%rdx) +; FALLBACK11-NEXT: movq %rdi, 32(%rdx) +; FALLBACK11-NEXT: movq %rbx, 40(%rdx) +; FALLBACK11-NEXT: movq %r8, 16(%rdx) +; FALLBACK11-NEXT: movq %rsi, 24(%rdx) +; FALLBACK11-NEXT: movq %r14, (%rdx) +; FALLBACK11-NEXT: movq %r10, 56(%rdx) +; FALLBACK11-NEXT: popq %rbx +; FALLBACK11-NEXT: popq %r14 +; FALLBACK11-NEXT: popq %r15 +; FALLBACK11-NEXT: vzeroupper +; FALLBACK11-NEXT: retq +; +; FALLBACK12-LABEL: ashr_64bytes: +; FALLBACK12: # %bb.0: +; FALLBACK12-NEXT: pushq %rbp +; FALLBACK12-NEXT: pushq %r15 +; FALLBACK12-NEXT: pushq %r14 +; FALLBACK12-NEXT: pushq %r13 +; FALLBACK12-NEXT: pushq %r12 +; FALLBACK12-NEXT: pushq %rbx +; FALLBACK12-NEXT: pushq %rax +; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK12-NEXT: movq 48(%rdi), %rax +; FALLBACK12-NEXT: movq 56(%rdi), %rcx +; FALLBACK12-NEXT: movl (%rsi), %edi +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: sarq $63, %rcx +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK12-NEXT: leal (,%rdi,8), %eax +; FALLBACK12-NEXT: andl $56, %eax +; FALLBACK12-NEXT: andl $56, %edi +; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10 +; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r10 
+; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: notb %sil +; FALLBACK12-NEXT: leaq (%r9,%r9), %r8 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r8 +; FALLBACK12-NEXT: orq %r10, %r8 +; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10 +; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %rbx +; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12 +; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r11 +; FALLBACK12-NEXT: orq %rbx, %r11 +; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx +; FALLBACK12-NEXT: movq %rbx, %r14 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r14 +; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r10 +; FALLBACK12-NEXT: orq %r14, %r10 +; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14 +; FALLBACK12-NEXT: movq %r14, %r13 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r13 +; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp +; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r15 +; FALLBACK12-NEXT: orq %r13, %r15 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r12 +; FALLBACK12-NEXT: addq %r14, %r14 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r14 +; FALLBACK12-NEXT: orq %r12, %r14 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %rbp +; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi +; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12 +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %r12 +; FALLBACK12-NEXT: orq %rbp, %r12 +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: shrq %cl, %r9 +; FALLBACK12-NEXT: addq %rbx, %rbx +; FALLBACK12-NEXT: movl %esi, %ecx +; FALLBACK12-NEXT: shlq %cl, %rbx +; FALLBACK12-NEXT: orq %r9, %rbx +; FALLBACK12-NEXT: movl %eax, %ecx +; FALLBACK12-NEXT: sarq %cl, %rdi +; FALLBACK12-NEXT: movq %rdi, 
56(%rdx) +; FALLBACK12-NEXT: movq %rbx, 8(%rdx) +; FALLBACK12-NEXT: movq %r12, 48(%rdx) +; FALLBACK12-NEXT: movq %r14, 32(%rdx) +; FALLBACK12-NEXT: movq %r15, 40(%rdx) +; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r11, 24(%rdx) +; FALLBACK12-NEXT: movq %r8, (%rdx) +; FALLBACK12-NEXT: addq $8, %rsp +; FALLBACK12-NEXT: popq %rbx +; FALLBACK12-NEXT: popq %r12 +; FALLBACK12-NEXT: popq %r13 +; FALLBACK12-NEXT: popq %r14 +; FALLBACK12-NEXT: popq %r15 +; FALLBACK12-NEXT: popq %rbp +; FALLBACK12-NEXT: vzeroupper +; FALLBACK12-NEXT: retq +; +; FALLBACK13-LABEL: ashr_64bytes: +; FALLBACK13: # %bb.0: +; FALLBACK13-NEXT: pushq %r15 +; FALLBACK13-NEXT: pushq %r14 +; FALLBACK13-NEXT: pushq %rbx +; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK13-NEXT: movq 48(%rdi), %rcx +; FALLBACK13-NEXT: movq 56(%rdi), %rdi +; FALLBACK13-NEXT: movl (%rsi), %eax +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: sarq $63, %rdi +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK13-NEXT: leal (,%rax,8), %ecx +; FALLBACK13-NEXT: andl $56, %ecx +; FALLBACK13-NEXT: andl $56, %eax +; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK13-NEXT: movq %r9, %rsi +; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK13-NEXT: movq %r10, %r8 +; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9 +; 
FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK13-NEXT: movq %r11, %rbx +; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi +; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK13-NEXT: movq %rax, %r15 +; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK13-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK13-NEXT: sarq %cl, %r11 +; FALLBACK13-NEXT: movq %r15, 8(%rdx) +; FALLBACK13-NEXT: movq %r9, 48(%rdx) +; FALLBACK13-NEXT: movq %r11, 56(%rdx) +; FALLBACK13-NEXT: movq %rdi, 32(%rdx) +; FALLBACK13-NEXT: movq %rbx, 40(%rdx) +; FALLBACK13-NEXT: movq %r8, 16(%rdx) +; FALLBACK13-NEXT: movq %rsi, 24(%rdx) +; FALLBACK13-NEXT: movq %r14, (%rdx) +; FALLBACK13-NEXT: popq %rbx +; FALLBACK13-NEXT: popq %r14 +; FALLBACK13-NEXT: popq %r15 +; FALLBACK13-NEXT: vzeroupper +; FALLBACK13-NEXT: retq +; +; FALLBACK14-LABEL: ashr_64bytes: +; FALLBACK14: # %bb.0: +; FALLBACK14-NEXT: pushq %rbp +; FALLBACK14-NEXT: pushq %r15 +; FALLBACK14-NEXT: pushq %r14 +; FALLBACK14-NEXT: pushq %r13 +; FALLBACK14-NEXT: pushq %r12 +; FALLBACK14-NEXT: pushq %rbx +; FALLBACK14-NEXT: pushq %rax +; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK14-NEXT: movq 48(%rdi), %rcx +; FALLBACK14-NEXT: movq 56(%rdi), %rdi +; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: sarq $63, %rdi +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: leal (,%rax,8), %esi +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx +; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi +; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 +; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 +; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 +; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 +; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 +; FALLBACK14-NEXT: movl %esi, %ebx +; FALLBACK14-NEXT: notb %bl +; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp +; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 +; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 +; FALLBACK14-NEXT: orq %r11, %r8 +; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 +; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 +; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 +; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 +; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi +; FALLBACK14-NEXT: addq %rdi, %rdi +; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK14-NEXT: orq %r9, %rdi +; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 +; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK14-NEXT: orq %r14, %r9 +; FALLBACK14-NEXT: addq %r10, %r10 +; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK14-NEXT: orq %r15, %r10 +; FALLBACK14-NEXT: addq %rax, %rax +; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK14-NEXT: orq %r13, %rax +; FALLBACK14-NEXT: addq %rcx, %rcx +; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK14-NEXT: orq %rbp, %rcx +; FALLBACK14-NEXT: movq %rsi, 56(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %rax, 48(%rdx) +; FALLBACK14-NEXT: movq %r10, 32(%rdx) +; FALLBACK14-NEXT: movq %r9, 
40(%rdx) +; FALLBACK14-NEXT: movq %rdi, 16(%rdx) +; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %r8, (%rdx) +; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: popq %rbx +; FALLBACK14-NEXT: popq %r12 +; FALLBACK14-NEXT: popq %r13 +; FALLBACK14-NEXT: popq %r14 +; FALLBACK14-NEXT: popq %r15 +; FALLBACK14-NEXT: popq %rbp +; FALLBACK14-NEXT: vzeroupper +; FALLBACK14-NEXT: retq +; +; FALLBACK15-LABEL: ashr_64bytes: +; FALLBACK15: # %bb.0: +; FALLBACK15-NEXT: pushq %r15 +; FALLBACK15-NEXT: pushq %r14 +; FALLBACK15-NEXT: pushq %rbx +; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 +; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1 +; FALLBACK15-NEXT: movq 48(%rdi), %rcx +; FALLBACK15-NEXT: movq 56(%rdi), %rdi +; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: sarq $63, %rdi +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: andl $56, %ecx +; FALLBACK15-NEXT: andl $56, %eax +; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi +; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq %r9, %rsi +; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi +; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: movq %r10, %r8 +; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 +; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq %r11, %rbx +; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx +; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi 
+; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 +; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax +; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 +; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10 +; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx +; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: movq %r15, 8(%rdx) +; FALLBACK15-NEXT: movq %r9, 48(%rdx) +; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rbx, 40(%rdx) +; FALLBACK15-NEXT: movq %r8, 16(%rdx) +; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %r14, (%rdx) +; FALLBACK15-NEXT: movq %r10, 56(%rdx) +; FALLBACK15-NEXT: popq %rbx +; FALLBACK15-NEXT: popq %r14 +; FALLBACK15-NEXT: popq %r15 +; FALLBACK15-NEXT: vzeroupper +; FALLBACK15-NEXT: retq +; +; FALLBACK16-LABEL: ashr_64bytes: +; FALLBACK16: # %bb.0: +; FALLBACK16-NEXT: pushl %ebp +; FALLBACK16-NEXT: pushl %ebx +; FALLBACK16-NEXT: pushl %edi +; FALLBACK16-NEXT: pushl %esi +; FALLBACK16-NEXT: subl $204, %esp +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl (%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 8(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 12(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 16(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 20(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 24(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 28(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 32(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 36(%ecx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 40(%ecx), %ebx +; FALLBACK16-NEXT: movl 44(%ecx), %edi +; FALLBACK16-NEXT: movl 48(%ecx), %esi +; FALLBACK16-NEXT: movl 52(%ecx), %edx +; FALLBACK16-NEXT: movl 56(%ecx), %eax +; FALLBACK16-NEXT: movl 60(%ecx), %ecx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK16-NEXT: movl (%ebp), %ebp +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: sarl $31, %ecx +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, %ecx +; FALLBACK16-NEXT: movl %ebp, %esi +; FALLBACK16-NEXT: andl $60, %esi +; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK16-NEXT: shll $3, %ecx +; FALLBACK16-NEXT: andl $24, %ecx +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: movl %ecx, %ebx +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: notb %ch +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %eax, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: 
shll %cl, %edx +; FALLBACK16-NEXT: orl %eax, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %edx, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %esi, %edx +; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp +; FALLBACK16-NEXT: movl %ebp, %edx +; 
FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %eax +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %edx, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: addl %ebp, %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %esi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: orl %edi, %ebx +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movl %edx, %ebx +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebp, 
%edx +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi +; FALLBACK16-NEXT: movl %esi, %eax +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx +; FALLBACK16-NEXT: leal (%edx,%edx), %ebp +; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %esi, %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %eax, %edx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: sarl %cl, %ebx +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: movl %ebx, 60(%eax) +; FALLBACK16-NEXT: movl %edx, 56(%eax) +; FALLBACK16-NEXT: movl %esi, 48(%eax) +; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl %edi, 40(%eax) +; 
FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 44(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 32(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 36(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 24(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 28(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 16(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 20(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 8(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 12(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, (%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: addl $204, %esp +; FALLBACK16-NEXT: popl %esi +; FALLBACK16-NEXT: popl %edi +; FALLBACK16-NEXT: popl %ebx +; FALLBACK16-NEXT: popl %ebp +; FALLBACK16-NEXT: retl +; +; FALLBACK17-LABEL: ashr_64bytes: +; FALLBACK17: # %bb.0: +; FALLBACK17-NEXT: pushl %ebp +; FALLBACK17-NEXT: pushl %ebx +; FALLBACK17-NEXT: pushl %edi +; FALLBACK17-NEXT: pushl %esi +; FALLBACK17-NEXT: subl $188, %esp +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl (%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 4(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 8(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; FALLBACK17-NEXT: movl 12(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 16(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 20(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 24(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 28(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 32(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 36(%eax), %ecx +; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 40(%eax), %ebp +; FALLBACK17-NEXT: movl 44(%eax), %ebx +; FALLBACK17-NEXT: movl 48(%eax), %edi +; FALLBACK17-NEXT: movl 52(%eax), %esi +; FALLBACK17-NEXT: movl 56(%eax), %edx +; FALLBACK17-NEXT: movl 60(%eax), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK17-NEXT: movl (%ecx), %ecx +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: sarl $31, %eax +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK17-NEXT: movl %ecx, %ebp +; FALLBACK17-NEXT: andl $60, %ebp +; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shll $3, %ecx +; FALLBACK17-NEXT: andl $24, %ecx +; FALLBACK17-NEXT: shrdl %cl, %edx, %eax +; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %edi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shrdl %cl, %esi, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %esi, %edx +; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %esi, %edi +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl %eax, %edi +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl 
%edx, 56(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK17-NEXT: sarl %cl, %eax +; FALLBACK17-NEXT: movl %eax, 60(%ebp) +; FALLBACK17-NEXT: movl %esi, 48(%ebp) +; FALLBACK17-NEXT: movl %edi, 52(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 40(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 44(%ebp) +; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 32(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 36(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 24(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 28(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 16(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 20(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 8(%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: addl $188, %esp +; FALLBACK17-NEXT: popl %esi +; FALLBACK17-NEXT: popl %edi +; FALLBACK17-NEXT: popl %ebx +; FALLBACK17-NEXT: popl %ebp +; FALLBACK17-NEXT: retl +; +; FALLBACK18-LABEL: ashr_64bytes: +; FALLBACK18: # %bb.0: +; FALLBACK18-NEXT: pushl %ebp +; FALLBACK18-NEXT: pushl %ebx +; FALLBACK18-NEXT: pushl %edi +; FALLBACK18-NEXT: pushl %esi +; FALLBACK18-NEXT: subl $204, %esp +; 
FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 12(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 16(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 20(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 24(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 28(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 36(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%eax), %ebp +; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 48(%eax), %edi +; FALLBACK18-NEXT: movl 52(%eax), %esi +; FALLBACK18-NEXT: movl 56(%eax), %edx +; FALLBACK18-NEXT: movl 60(%eax), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: sarl $31, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: andl $60, %ecx +; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK18-NEXT: 
movl 72(%esp,%ecx), %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: leal (%edi,%edi), %ebp +; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %edi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 
96(%esp,%ecx), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: orl %eax, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: addl %edi, %edi +; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: leal (%eax,%eax), %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl %ecx, %edi +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: addl %esi, %esi +; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx +; 
FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK18-NEXT: shrxl %edx, %eax, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %eax, %eax +; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx +; FALLBACK18-NEXT: addl %ebp, %ebp +; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK18-NEXT: movl %edx, 60(%eax) +; FALLBACK18-NEXT: movl %ebx, 56(%eax) +; FALLBACK18-NEXT: movl %edi, 48(%eax) +; FALLBACK18-NEXT: movl %ecx, 52(%eax) +; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 44(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 32(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 36(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 24(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 28(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 16(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 20(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 8(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, (%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: addl $204, %esp +; FALLBACK18-NEXT: popl %esi +; FALLBACK18-NEXT: popl %edi +; FALLBACK18-NEXT: popl %ebx +; FALLBACK18-NEXT: popl %ebp +; FALLBACK18-NEXT: retl +; +; FALLBACK19-LABEL: ashr_64bytes: +; FALLBACK19: # %bb.0: +; FALLBACK19-NEXT: pushl %ebp +; FALLBACK19-NEXT: pushl %ebx +; FALLBACK19-NEXT: pushl %edi +; FALLBACK19-NEXT: pushl %esi +; FALLBACK19-NEXT: subl $188, %esp +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK19-NEXT: movl (%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 4(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 8(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 12(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 16(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 20(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 24(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 28(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 32(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 36(%eax), %ecx +; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 40(%eax), %ebp +; FALLBACK19-NEXT: movl 44(%eax), %ebx +; FALLBACK19-NEXT: movl 48(%eax), %edi +; FALLBACK19-NEXT: movl 52(%eax), %esi +; FALLBACK19-NEXT: movl 
56(%eax), %edx +; FALLBACK19-NEXT: movl 60(%eax), %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK19-NEXT: movl (%ecx), %ecx +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: sarl $31, %eax +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; 
FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK19-NEXT: movl %ecx, %ebp +; FALLBACK19-NEXT: andl $60, %ebp +; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shll $3, %ecx +; FALLBACK19-NEXT: andl $24, %ecx +; FALLBACK19-NEXT: shrdl %cl, %edx, %eax +; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %esi +; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %esi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edi +; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %edi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %esi +; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax +; 
FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %edi +; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %esi, %edx +; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl %edi, %edx +; FALLBACK19-NEXT: shrdl %cl, %eax, %edx +; FALLBACK19-NEXT: shrdl %cl, %edi, %esi +; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK19-NEXT: movl %eax, 56(%ebp) +; FALLBACK19-NEXT: movl %esi, 48(%ebp) +; FALLBACK19-NEXT: movl %edx, 52(%ebp) +; FALLBACK19-NEXT: movl %ebx, 40(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 44(%ebp) +; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 32(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 36(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 24(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 28(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 16(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 20(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
FALLBACK19-NEXT: movl %eax, 8(%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK19-NEXT: movl %eax, 12(%ebp) +; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK19-NEXT: shrdl %cl, %edx, %edi +; FALLBACK19-NEXT: movl %edi, (%ebp) +; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK19-NEXT: movl %ecx, 4(%ebp) +; FALLBACK19-NEXT: movl %eax, 60(%ebp) +; FALLBACK19-NEXT: addl $188, %esp +; FALLBACK19-NEXT: popl %esi +; FALLBACK19-NEXT: popl %edi +; FALLBACK19-NEXT: popl %ebx +; FALLBACK19-NEXT: popl %ebp +; FALLBACK19-NEXT: retl +; +; FALLBACK20-LABEL: ashr_64bytes: +; FALLBACK20: # %bb.0: +; FALLBACK20-NEXT: pushl %ebp +; FALLBACK20-NEXT: pushl %ebx +; FALLBACK20-NEXT: pushl %edi +; FALLBACK20-NEXT: pushl %esi +; FALLBACK20-NEXT: subl $204, %esp +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK20-NEXT: movups (%ecx), %xmm0 +; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK20-NEXT: movl 48(%ecx), %edx +; FALLBACK20-NEXT: movl 52(%ecx), %esi +; FALLBACK20-NEXT: movl 56(%ecx), %edi +; FALLBACK20-NEXT: movl 60(%ecx), %ecx +; FALLBACK20-NEXT: movl (%eax), %eax +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: sarl $31, %ecx +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK20-NEXT: movl %eax, %esi +; FALLBACK20-NEXT: andl $60, %esi +; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK20-NEXT: shll $3, %eax +; FALLBACK20-NEXT: andl $24, %eax +; FALLBACK20-NEXT: movl %edx, %edi +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movb %al, %ch +; FALLBACK20-NEXT: notb %ch +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %edi, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK20-NEXT: movl %edx, %ebp +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: addl %eax, %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %eax, %ebx +; FALLBACK20-NEXT: 
movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK20-NEXT: movl %ebx, %ebp +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK20-NEXT: leal (%edx,%edx), %eax +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %edi, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK20-NEXT: movl %edi, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %edx, %edi +; FALLBACK20-NEXT: movl %esi, %edx +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK20-NEXT: movl %esi, %ebx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, 
%ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: addl %esi, %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: orl %eax, %edx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: sarl %cl, %ebx +; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK20-NEXT: movl %ebx, 60(%eax) +; FALLBACK20-NEXT: movl %edx, 56(%eax) +; FALLBACK20-NEXT: movl %esi, 48(%eax) +; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; FALLBACK20-NEXT: movl %edi, 40(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 44(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 32(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 36(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 24(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 28(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 16(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 20(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
FALLBACK20-NEXT: movl %ecx, 8(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 12(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, (%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 4(%eax) +; FALLBACK20-NEXT: addl $204, %esp +; FALLBACK20-NEXT: popl %esi +; FALLBACK20-NEXT: popl %edi +; FALLBACK20-NEXT: popl %ebx +; FALLBACK20-NEXT: popl %ebp +; FALLBACK20-NEXT: retl +; +; FALLBACK21-LABEL: ashr_64bytes: +; FALLBACK21: # %bb.0: +; FALLBACK21-NEXT: pushl %ebp +; FALLBACK21-NEXT: pushl %ebx +; FALLBACK21-NEXT: pushl %edi +; FALLBACK21-NEXT: pushl %esi +; FALLBACK21-NEXT: subl $188, %esp +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movups (%eax), %xmm0 +; FALLBACK21-NEXT: movups 16(%eax), %xmm1 +; FALLBACK21-NEXT: movups 32(%eax), %xmm2 +; FALLBACK21-NEXT: movl 48(%eax), %edx +; FALLBACK21-NEXT: movl 52(%eax), %esi +; FALLBACK21-NEXT: movl 56(%eax), %edi +; FALLBACK21-NEXT: movl 60(%eax), %eax +; FALLBACK21-NEXT: movl (%ecx), %ecx +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: sarl $31, %eax +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK21-NEXT: movl %ecx, %ebp +; FALLBACK21-NEXT: andl $60, %ebp +; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shll $3, %ecx +; FALLBACK21-NEXT: andl $24, %ecx +; FALLBACK21-NEXT: shrdl %cl, %edx, %eax +; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %edi, %edx +; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shrdl %cl, %esi, %edx +; 
FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %esi, %edx +; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edi +; FALLBACK21-NEXT: shrdl %cl, %esi, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl %eax, %edi +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %edx, 56(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK21-NEXT: sarl %cl, %eax +; FALLBACK21-NEXT: movl %eax, 60(%ebp) +; FALLBACK21-NEXT: movl %esi, 48(%ebp) +; FALLBACK21-NEXT: movl %edi, 52(%ebp) +; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 40(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 44(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 32(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 36(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 24(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 28(%ebp) +; FALLBACK21-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 16(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 20(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 8(%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 12(%ebp) +; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; FALLBACK21-NEXT: addl $188, %esp +; FALLBACK21-NEXT: popl %esi +; FALLBACK21-NEXT: popl %edi +; FALLBACK21-NEXT: popl %ebx +; FALLBACK21-NEXT: popl %ebp +; FALLBACK21-NEXT: retl +; +; FALLBACK22-LABEL: ashr_64bytes: +; FALLBACK22: # %bb.0: +; FALLBACK22-NEXT: pushl %ebp +; FALLBACK22-NEXT: pushl %ebx +; FALLBACK22-NEXT: pushl %edi +; FALLBACK22-NEXT: pushl %esi +; FALLBACK22-NEXT: subl $204, %esp +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movups (%ecx), %xmm0 +; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 +; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 +; FALLBACK22-NEXT: movl 48(%ecx), %edx +; FALLBACK22-NEXT: movl 52(%ecx), %esi +; FALLBACK22-NEXT: movl 56(%ecx), %edi +; FALLBACK22-NEXT: movl 60(%ecx), %ecx +; FALLBACK22-NEXT: movl (%eax), %eax +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: leal (,%eax,8), %edx +; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: andl $60, %ecx +; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: leal (%edi,%edi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; 
FALLBACK22-NEXT: orl %eax, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: orl %eax, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax +; 
FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl %ecx, %edi +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: addl %esi, %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ecx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx +; FALLBACK22-NEXT: addl %ebp, %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK22-NEXT: movl %edx, 60(%eax) +; FALLBACK22-NEXT: movl %ebx, 56(%eax) +; FALLBACK22-NEXT: movl %edi, 48(%eax) +; FALLBACK22-NEXT: movl %ecx, 52(%eax) +; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 44(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: 
movl %ecx, 32(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 36(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 24(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 28(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 16(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 20(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 8(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 12(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, (%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 4(%eax) +; FALLBACK22-NEXT: addl $204, %esp +; FALLBACK22-NEXT: popl %esi +; FALLBACK22-NEXT: popl %edi +; FALLBACK22-NEXT: popl %ebx +; FALLBACK22-NEXT: popl %ebp +; FALLBACK22-NEXT: retl +; +; FALLBACK23-LABEL: ashr_64bytes: +; FALLBACK23: # %bb.0: +; FALLBACK23-NEXT: pushl %ebp +; FALLBACK23-NEXT: pushl %ebx +; FALLBACK23-NEXT: pushl %edi +; FALLBACK23-NEXT: pushl %esi +; FALLBACK23-NEXT: subl $188, %esp +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK23-NEXT: movups (%eax), %xmm0 +; FALLBACK23-NEXT: movups 16(%eax), %xmm1 +; FALLBACK23-NEXT: movups 32(%eax), %xmm2 +; FALLBACK23-NEXT: movl 48(%eax), %edx +; FALLBACK23-NEXT: movl 52(%eax), %esi +; FALLBACK23-NEXT: movl 56(%eax), %edi +; FALLBACK23-NEXT: movl 60(%eax), %eax +; FALLBACK23-NEXT: movl (%ecx), %ecx +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: sarl $31, %eax +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK23-NEXT: movl %ecx, %ebp +; FALLBACK23-NEXT: andl $60, %ebp +; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shll $3, %ecx +; FALLBACK23-NEXT: andl $24, %ecx +; FALLBACK23-NEXT: shrdl %cl, %edx, %eax +; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %esi +; FALLBACK23-NEXT: shrdl %cl, %edi, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %edi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %esi +; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %edi +; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %esi, %edx +; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl %edi, %edx +; FALLBACK23-NEXT: shrdl %cl, %eax, %edx +; FALLBACK23-NEXT: shrdl %cl, %edi, %esi +; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK23-NEXT: movl %eax, 56(%ebp) +; FALLBACK23-NEXT: movl %esi, 48(%ebp) +; FALLBACK23-NEXT: movl %edx, 52(%ebp) +; FALLBACK23-NEXT: movl %ebx, 40(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 44(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 32(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload +; FALLBACK23-NEXT: movl %eax, 36(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 24(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 28(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 16(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 20(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 8(%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK23-NEXT: movl %eax, 12(%ebp) +; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK23-NEXT: shrdl %cl, %edx, %edi +; FALLBACK23-NEXT: movl %edi, (%ebp) +; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK23-NEXT: movl %ecx, 4(%ebp) +; FALLBACK23-NEXT: movl %eax, 60(%ebp) +; FALLBACK23-NEXT: addl $188, %esp +; FALLBACK23-NEXT: popl %esi +; FALLBACK23-NEXT: popl %edi +; FALLBACK23-NEXT: popl %ebx +; FALLBACK23-NEXT: popl %ebp +; FALLBACK23-NEXT: retl +; +; FALLBACK24-LABEL: ashr_64bytes: +; FALLBACK24: # %bb.0: +; FALLBACK24-NEXT: pushl %ebp +; FALLBACK24-NEXT: pushl %ebx +; FALLBACK24-NEXT: pushl %edi +; FALLBACK24-NEXT: pushl %esi +; FALLBACK24-NEXT: subl $204, %esp +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1 +; FALLBACK24-NEXT: movl 48(%ecx), %edx +; FALLBACK24-NEXT: movl 52(%ecx), %esi +; FALLBACK24-NEXT: movl 56(%ecx), %edi +; FALLBACK24-NEXT: movl 60(%ecx), %ecx +; FALLBACK24-NEXT: movl (%eax), %eax +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: 
movl %edi, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: sarl $31, %ecx +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, %esi +; FALLBACK24-NEXT: andl $60, %esi +; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK24-NEXT: shll $3, %eax +; FALLBACK24-NEXT: andl $24, %eax +; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movb %al, %ch +; FALLBACK24-NEXT: notb %ch +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %edi, %edx +; FALLBACK24-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK24-NEXT: movl %edx, %ebp +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movl %eax, %edx +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: addl %eax, %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; 
FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %eax, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK24-NEXT: movl %ebx, %ebp +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK24-NEXT: leal (%edx,%edx), %eax +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %edi, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK24-NEXT: movl %edi, %ebp +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movb %ch, %cl +; 
FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %edx, %edi +; FALLBACK24-NEXT: movl %esi, %edx +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK24-NEXT: movl %esi, %ebx +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: addl %esi, %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: orl %eax, %edx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: sarl %cl, %ebx +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: movl %ebx, 60(%eax) +; FALLBACK24-NEXT: movl %edx, 56(%eax) +; FALLBACK24-NEXT: movl %esi, 48(%eax) +; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl %edi, 40(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 44(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 32(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 36(%eax) +; FALLBACK24-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 24(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 28(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 16(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 20(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 8(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 12(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, (%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 4(%eax) +; FALLBACK24-NEXT: addl $204, %esp +; FALLBACK24-NEXT: popl %esi +; FALLBACK24-NEXT: popl %edi +; FALLBACK24-NEXT: popl %ebx +; FALLBACK24-NEXT: popl %ebp +; FALLBACK24-NEXT: vzeroupper +; FALLBACK24-NEXT: retl +; +; FALLBACK25-LABEL: ashr_64bytes: +; FALLBACK25: # %bb.0: +; FALLBACK25-NEXT: pushl %ebp +; FALLBACK25-NEXT: pushl %ebx +; FALLBACK25-NEXT: pushl %edi +; FALLBACK25-NEXT: pushl %esi +; FALLBACK25-NEXT: subl $188, %esp +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: vmovups (%eax), %ymm0 +; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1 +; FALLBACK25-NEXT: movl 48(%eax), %edx +; FALLBACK25-NEXT: movl 52(%eax), %esi +; FALLBACK25-NEXT: movl 56(%eax), %edi +; FALLBACK25-NEXT: movl 60(%eax), %eax +; FALLBACK25-NEXT: movl (%ecx), %ecx +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: 
sarl $31, %eax +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK25-NEXT: movl %ecx, %ebp +; FALLBACK25-NEXT: andl $60, %ebp +; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shll $3, %ecx +; FALLBACK25-NEXT: andl $24, %ecx +; FALLBACK25-NEXT: shrdl %cl, %edx, %eax +; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax +; 
FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %edi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shrdl %cl, %esi, %edx +; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %esi, %edx +; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edi +; FALLBACK25-NEXT: shrdl %cl, %esi, %edi +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl %eax, %edi +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %edx, 56(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK25-NEXT: sarl %cl, %eax +; FALLBACK25-NEXT: movl %eax, 60(%ebp) +; FALLBACK25-NEXT: movl %esi, 48(%ebp) +; FALLBACK25-NEXT: movl %edi, 52(%ebp) +; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 40(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 44(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 32(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 36(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 24(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 28(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 16(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 20(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 8(%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: addl $188, %esp +; FALLBACK25-NEXT: popl %esi +; FALLBACK25-NEXT: popl %edi +; FALLBACK25-NEXT: popl %ebx +; FALLBACK25-NEXT: popl %ebp +; FALLBACK25-NEXT: vzeroupper +; FALLBACK25-NEXT: retl +; +; FALLBACK26-LABEL: ashr_64bytes: +; FALLBACK26: # %bb.0: +; FALLBACK26-NEXT: pushl %ebp +; FALLBACK26-NEXT: pushl %ebx +; FALLBACK26-NEXT: pushl %edi +; FALLBACK26-NEXT: pushl %esi +; FALLBACK26-NEXT: subl $204, %esp +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1 +; FALLBACK26-NEXT: movl 48(%ecx), %edx +; FALLBACK26-NEXT: movl 52(%ecx), %esi +; FALLBACK26-NEXT: movl 56(%ecx), %edi +; FALLBACK26-NEXT: movl 60(%ecx), %ecx +; FALLBACK26-NEXT: movl (%eax), %eax +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) 
+; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: leal (,%eax,8), %edx +; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: andl $60, %ecx +; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: leal (%edi,%edi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; 
FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %eax, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %eax, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: orl 
%ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: addl %edi, %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK26-NEXT: movl %ecx, %edi +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: addl %esi, %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %ecx, %esi +; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ecx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx +; FALLBACK26-NEXT: addl %ebp, %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK26-NEXT: movl %edx, 60(%eax) +; 
FALLBACK26-NEXT: movl %ebx, 56(%eax) +; FALLBACK26-NEXT: movl %edi, 48(%eax) +; FALLBACK26-NEXT: movl %ecx, 52(%eax) +; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 44(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 32(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 36(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 24(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 28(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 16(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 20(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 8(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 12(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, (%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 4(%eax) +; FALLBACK26-NEXT: addl $204, %esp +; FALLBACK26-NEXT: popl %esi +; FALLBACK26-NEXT: popl %edi +; FALLBACK26-NEXT: popl %ebx +; FALLBACK26-NEXT: popl %ebp +; FALLBACK26-NEXT: vzeroupper +; FALLBACK26-NEXT: retl +; +; FALLBACK27-LABEL: ashr_64bytes: +; FALLBACK27: # %bb.0: +; FALLBACK27-NEXT: pushl %ebp +; FALLBACK27-NEXT: pushl %ebx +; FALLBACK27-NEXT: pushl %edi +; FALLBACK27-NEXT: pushl %esi +; FALLBACK27-NEXT: subl $188, %esp +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK27-NEXT: vmovups (%eax), %ymm0 +; FALLBACK27-NEXT: vmovups 32(%eax), 
%xmm1 +; FALLBACK27-NEXT: movl 48(%eax), %edx +; FALLBACK27-NEXT: movl 52(%eax), %esi +; FALLBACK27-NEXT: movl 56(%eax), %edi +; FALLBACK27-NEXT: movl 60(%eax), %eax +; FALLBACK27-NEXT: movl (%ecx), %ecx +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: sarl $31, %eax +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK27-NEXT: movl %ecx, %ebp +; FALLBACK27-NEXT: andl $60, %ebp +; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shll $3, %ecx +; FALLBACK27-NEXT: andl $24, %ecx +; FALLBACK27-NEXT: shrdl %cl, %edx, %eax +; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %esi +; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx 
+; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %edi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %esi +; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %edi +; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %esi, %edx +; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl %edi, %edx +; FALLBACK27-NEXT: shrdl %cl, %eax, %edx +; FALLBACK27-NEXT: shrdl %cl, %edi, %esi +; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK27-NEXT: movl %eax, 56(%ebp) +; FALLBACK27-NEXT: movl %esi, 48(%ebp) +; FALLBACK27-NEXT: movl %edx, 52(%ebp) +; FALLBACK27-NEXT: movl 
%ebx, 40(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 44(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 32(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 36(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 24(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 28(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 16(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 20(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 8(%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK27-NEXT: movl %eax, 12(%ebp) +; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK27-NEXT: shrdl %cl, %edx, %edi +; FALLBACK27-NEXT: movl %edi, (%ebp) +; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK27-NEXT: movl %ecx, 4(%ebp) +; FALLBACK27-NEXT: movl %eax, 60(%ebp) +; FALLBACK27-NEXT: addl $188, %esp +; FALLBACK27-NEXT: popl %esi +; FALLBACK27-NEXT: popl %edi +; FALLBACK27-NEXT: popl %ebx +; FALLBACK27-NEXT: popl %ebp +; FALLBACK27-NEXT: vzeroupper +; FALLBACK27-NEXT: retl +; +; FALLBACK28-LABEL: ashr_64bytes: +; FALLBACK28: # %bb.0: +; FALLBACK28-NEXT: pushl %ebp +; FALLBACK28-NEXT: pushl %ebx +; FALLBACK28-NEXT: pushl %edi +; FALLBACK28-NEXT: pushl %esi +; FALLBACK28-NEXT: subl $204, %esp +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
FALLBACK28-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1 +; FALLBACK28-NEXT: movl 48(%ecx), %edx +; FALLBACK28-NEXT: movl 52(%ecx), %esi +; FALLBACK28-NEXT: movl 56(%ecx), %edi +; FALLBACK28-NEXT: movl 60(%ecx), %ecx +; FALLBACK28-NEXT: movl (%eax), %eax +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: sarl $31, %ecx +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, %esi +; FALLBACK28-NEXT: andl $60, %esi +; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK28-NEXT: shll $3, %eax +; FALLBACK28-NEXT: andl $24, %eax +; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movb %al, %ch +; FALLBACK28-NEXT: notb %ch +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx 
+; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %edi, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx +; FALLBACK28-NEXT: movl %edx, %ebp +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movl %eax, %edx +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: addl %eax, %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx +; FALLBACK28-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %eax, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx +; FALLBACK28-NEXT: movl %ebx, %ebp +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx +; FALLBACK28-NEXT: leal (%edx,%edx), %eax +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %edi, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi +; FALLBACK28-NEXT: movl %edi, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movb 
{{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %edx, %edi +; FALLBACK28-NEXT: movl %esi, %edx +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi +; FALLBACK28-NEXT: movl %esi, %ebx +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax +; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: addl %esi, %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: orl %eax, %edx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: sarl %cl, %ebx +; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: movl %ebx, 60(%eax) +; FALLBACK28-NEXT: movl %edx, 56(%eax) +; FALLBACK28-NEXT: movl %esi, 48(%eax) +; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl %edi, 40(%eax) +; 
FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 44(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 32(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 36(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 24(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 28(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 16(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 20(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 8(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 12(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, (%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 4(%eax) +; FALLBACK28-NEXT: addl $204, %esp +; FALLBACK28-NEXT: popl %esi +; FALLBACK28-NEXT: popl %edi +; FALLBACK28-NEXT: popl %ebx +; FALLBACK28-NEXT: popl %ebp +; FALLBACK28-NEXT: vzeroupper +; FALLBACK28-NEXT: retl +; +; FALLBACK29-LABEL: ashr_64bytes: +; FALLBACK29: # %bb.0: +; FALLBACK29-NEXT: pushl %ebp +; FALLBACK29-NEXT: pushl %ebx +; FALLBACK29-NEXT: pushl %edi +; FALLBACK29-NEXT: pushl %esi +; FALLBACK29-NEXT: subl $188, %esp +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: vmovups (%eax), %ymm0 +; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1 +; FALLBACK29-NEXT: movl 48(%eax), %edx +; FALLBACK29-NEXT: movl 52(%eax), %esi +; FALLBACK29-NEXT: movl 56(%eax), %edi +; FALLBACK29-NEXT: movl 60(%eax), 
%eax +; FALLBACK29-NEXT: movl (%ecx), %ecx +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: sarl $31, %eax +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK29-NEXT: movl %ecx, %ebp +; FALLBACK29-NEXT: andl $60, %ebp +; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shll $3, %ecx +; FALLBACK29-NEXT: andl $24, %ecx +; FALLBACK29-NEXT: shrdl %cl, %edx, %eax +; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax 
+; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %edi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shrdl %cl, %esi, %edx +; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %esi, %edx +; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edi +; FALLBACK29-NEXT: shrdl %cl, %esi, %edi +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx +; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl %eax, %edi +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx +; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax +; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %edx, 56(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK29-NEXT: sarl %cl, %eax +; FALLBACK29-NEXT: 
movl %eax, 60(%ebp) +; FALLBACK29-NEXT: movl %esi, 48(%ebp) +; FALLBACK29-NEXT: movl %edi, 52(%ebp) +; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 40(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 44(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 32(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 36(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 24(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 28(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 16(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 20(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 8(%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: addl $188, %esp +; FALLBACK29-NEXT: popl %esi +; FALLBACK29-NEXT: popl %edi +; FALLBACK29-NEXT: popl %ebx +; FALLBACK29-NEXT: popl %ebp +; FALLBACK29-NEXT: vzeroupper +; FALLBACK29-NEXT: retl +; +; FALLBACK30-LABEL: ashr_64bytes: +; FALLBACK30: # %bb.0: +; FALLBACK30-NEXT: pushl %ebp +; FALLBACK30-NEXT: pushl %ebx +; FALLBACK30-NEXT: pushl %edi +; FALLBACK30-NEXT: pushl %esi +; FALLBACK30-NEXT: subl $204, %esp +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 +; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1 +; FALLBACK30-NEXT: movl 
48(%ecx), %edx +; FALLBACK30-NEXT: movl 52(%ecx), %esi +; FALLBACK30-NEXT: movl 56(%ecx), %edi +; FALLBACK30-NEXT: movl 60(%ecx), %ecx +; FALLBACK30-NEXT: movl (%eax), %eax +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: leal (,%eax,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: leal (%edi,%edi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %eax, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %eax, %edi +; FALLBACK30-NEXT: 
movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: addl %edi, %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl %ecx, %edi +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: addl %esi, %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %ecx, %esi +; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ecx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax +; 
FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp +; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx +; FALLBACK30-NEXT: addl %ebp, %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx +; FALLBACK30-NEXT: orl %eax, %ebx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK30-NEXT: movl %edx, 60(%eax) +; FALLBACK30-NEXT: movl %ebx, 56(%eax) +; FALLBACK30-NEXT: movl %edi, 48(%eax) +; FALLBACK30-NEXT: movl %ecx, 52(%eax) +; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 44(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 32(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 36(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 24(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 28(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 16(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 20(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 8(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 12(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, (%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 4(%eax) +; FALLBACK30-NEXT: addl $204, %esp +; FALLBACK30-NEXT: popl %esi +; FALLBACK30-NEXT: popl %edi +; FALLBACK30-NEXT: popl %ebx +; FALLBACK30-NEXT: popl %ebp +; FALLBACK30-NEXT: vzeroupper +; FALLBACK30-NEXT: retl +; +; FALLBACK31-LABEL: ashr_64bytes: 
+; FALLBACK31: # %bb.0: +; FALLBACK31-NEXT: pushl %ebp +; FALLBACK31-NEXT: pushl %ebx +; FALLBACK31-NEXT: pushl %edi +; FALLBACK31-NEXT: pushl %esi +; FALLBACK31-NEXT: subl $188, %esp +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK31-NEXT: vmovups (%eax), %ymm0 +; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1 +; FALLBACK31-NEXT: movl 48(%eax), %edx +; FALLBACK31-NEXT: movl 52(%eax), %esi +; FALLBACK31-NEXT: movl 56(%eax), %edi +; FALLBACK31-NEXT: movl 60(%eax), %eax +; FALLBACK31-NEXT: movl (%ecx), %ecx +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: sarl $31, %eax +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK31-NEXT: movl %ecx, %ebp +; FALLBACK31-NEXT: andl $60, %ebp +; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx +; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shll $3, %ecx +; FALLBACK31-NEXT: andl $24, %ecx +; FALLBACK31-NEXT: shrdl %cl, 
%edx, %eax +; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %esi +; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %edi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %esi +; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx +; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %edi +; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi +; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %esi, %edx +; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx +; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax +; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl %edi, %edx +; FALLBACK31-NEXT: shrdl %cl, %eax, %edx +; FALLBACK31-NEXT: shrdl %cl, %edi, %esi +; 
FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi +; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp +; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill +; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax +; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK31-NEXT: movl %eax, 56(%ebp) +; FALLBACK31-NEXT: movl %esi, 48(%ebp) +; FALLBACK31-NEXT: movl %edx, 52(%ebp) +; FALLBACK31-NEXT: movl %ebx, 40(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 44(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 32(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 36(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 24(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 28(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 16(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 20(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 8(%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK31-NEXT: movl %eax, 12(%ebp) +; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload +; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK31-NEXT: shrdl %cl, %edx, %edi +; FALLBACK31-NEXT: movl %edi, (%ebp) +; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK31-NEXT: movl %ecx, 4(%ebp) +; FALLBACK31-NEXT: movl %eax, 60(%ebp) +; FALLBACK31-NEXT: addl $188, %esp +; FALLBACK31-NEXT: popl %esi +; FALLBACK31-NEXT: popl %edi +; FALLBACK31-NEXT: popl %ebx +; FALLBACK31-NEXT: popl %ebp +; 
FALLBACK31-NEXT: vzeroupper +; FALLBACK31-NEXT: retl + %src = load i512, ptr %src.ptr, align 1 + %byteOff = load i512, ptr %byteOff.ptr, align 1 + %bitOff = shl i512 %byteOff, 3 + %res = ashr i512 %src, %bitOff + store i512 %res, ptr %dst, align 1 + ret void +} + +define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { +; X64-SSE2-LABEL: ashr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), %rax @@ -2394,15 +24296,15 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-SSE2-NEXT: andl $63, %esi -; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax -; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: andl $7, %esi +; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax +; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx +; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi +; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8 +; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 +; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 +; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 +; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) @@ -2414,8 +24316,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; -; X64-SSE42-LABEL: ashr_64bytes: +; X64-SSE42-LABEL: ashr_64bytes_qwordOff: ; X64-SSE42: # %bb.0: +; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; 
X64-SSE42-NEXT: movups 32(%rdi), %xmm2 @@ -2424,9 +24327,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE42-NEXT: movl (%rsi), %esi ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: sarq $63, %rcx ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) @@ -2436,19 +24339,21 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-SSE42-NEXT: andl $63, %esi -; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0 -; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1 -; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2 -; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3 +; X64-SSE42-NEXT: andl $7, %esi +; X64-SSE42-NEXT: movups -128(%rsp,%rsi,8), %xmm0 +; X64-SSE42-NEXT: movups -112(%rsp,%rsi,8), %xmm1 +; X64-SSE42-NEXT: movups -96(%rsp,%rsi,8), %xmm2 +; X64-SSE42-NEXT: movups -80(%rsp,%rsi,8), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) +; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_64bytes: +; X64-AVX-LABEL: ashr_64bytes_qwordOff: ; X64-AVX: # %bb.0: +; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX-NEXT: movq 48(%rdi), %rax @@ -2456,7 +24361,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX-NEXT: movl (%rsi), %esi 
; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: sarq $63, %rcx ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) @@ -2467,25 +24372,26 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $63, %esi -; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2 -; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3 +; X64-AVX-NEXT: andl $7, %esi +; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0 +; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1 +; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2 +; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3 ; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; -; X86-SSE2-LABEL: ashr_64bytes: +; X86-SSE2-LABEL: ashr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: subl $168, %esp +; X86-SSE2-NEXT: subl $188, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -2506,7 +24412,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl 32(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 40(%eax), %ebp ; X86-SSE2-NEXT: movl 44(%eax), %ebx ; X86-SSE2-NEXT: movl 48(%eax), %edi @@ -2520,7 +24426,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -2558,33 +24464,33 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: andl $63, %eax -; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx +; X86-SSE2-NEXT: andl $7, %eax +; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp +; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx +; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi +; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi +; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx +; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 56(%eax) ; X86-SSE2-NEXT: movl %edx, 60(%eax) @@ -2592,7 +24498,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) @@ -2612,14 +24518,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) -; X86-SSE2-NEXT: addl 
$168, %esp +; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X86-SSE42-LABEL: ashr_64bytes: +; X86-SSE42-LABEL: ashr_64bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi @@ -2640,9 +24546,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: movups %xmm0, (%esp) +; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -2660,11 +24566,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE42-NEXT: andl $63, %ecx -; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0 -; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1 -; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2 -; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3 +; X86-SSE42-NEXT: andl $7, %ecx +; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 +; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 +; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2 +; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax) ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) @@ -2675,7 +24581,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: ashr_64bytes: +; X86-AVX-LABEL: 
ashr_64bytes_qwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi @@ -2695,7 +24601,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -2714,11 +24620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andl $63, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3 +; X86-AVX-NEXT: andl $7, %ecx +; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 +; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 +; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 +; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) @@ -2730,45 +24636,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i512, ptr %src.ptr, align 1 - %byteOff = load i512, ptr %byteOff.ptr, align 1 - %bitOff = shl i512 %byteOff, 3 + %qwordOff = load i512, ptr %qwordOff.ptr, align 1 + %bitOff = shl i512 %qwordOff, 6 %res = ashr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; ALL: {{.*}} -; FALLBACK0: {{.*}} -; FALLBACK1: {{.*}} -; FALLBACK10: {{.*}} -; FALLBACK11: {{.*}} -; FALLBACK12: {{.*}} -; FALLBACK13: {{.*}} -; FALLBACK14: {{.*}} -; FALLBACK15: {{.*}} -; FALLBACK16: {{.*}} -; FALLBACK17: {{.*}} -; FALLBACK18: {{.*}} -; FALLBACK19: {{.*}} -; FALLBACK2: {{.*}} -; FALLBACK20: {{.*}} -; FALLBACK21: {{.*}} -; FALLBACK22: {{.*}} -; FALLBACK23: {{.*}} -; FALLBACK24: {{.*}} -; FALLBACK25: {{.*}} -; FALLBACK26: {{.*}} -; FALLBACK27: {{.*}} -; FALLBACK28: {{.*}} -; FALLBACK29: {{.*}} -; FALLBACK3: {{.*}} -; FALLBACK30: {{.*}} -; FALLBACK31: {{.*}} -; FALLBACK4: {{.*}} -; FALLBACK5: {{.*}} -; FALLBACK6: {{.*}} -; FALLBACK7: {{.*}} -; FALLBACK8: {{.*}} -; FALLBACK9: {{.*}} ; X64: {{.*}} ; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index f84131dfc879..8c0873492ce4 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -588,61 +588,58 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: 
movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %al -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%eax), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -655,50 +652,39 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movb %ah, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -711,51 +697,49 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -768,47 +752,40 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, %esp 
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal 
(%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -899,66 +876,62 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; 
X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $60, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -967,58 +940,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X86-NO-BMI2-HAVE-SHLD-LABEL: shl_16bytes: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: -; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 
{{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx 
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: retl ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: shl_16bytes: @@ -1027,34 +987,32 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi @@ -1072,7 +1030,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1081,57 +1039,45 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: shl_16bytes: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %dl, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %src = load i128, ptr 
%src.ptr, align 1 %bitOff = load i128, ptr %bitOff.ptr, align 1 @@ -1218,62 +1164,61 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %dh +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp 
-; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %eax +; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dh, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 8(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ebp) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1286,51 +1231,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
4(%edx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -1343,52 +1279,52 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ecx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1401,48 +1337,43 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $44, 
%esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %dl, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx 
@@ -1459,35 +1390,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; 
X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi @@ -1496,142 +1426,124 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
16(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
-40(%rsp,%rsi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
$0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 
24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: @@ -1640,127 +1552,120 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, 
%esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch +; 
X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1775,95 +1680,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), 
%esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 
16(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp @@ -1879,103 +1756,95 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl 
%ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1988,92 +1857,73 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte 
Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2089,31 +1939,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: negb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %sil, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl +; 
X64-NO-BMI2-NO-SHLD-NEXT: negb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx @@ -2146,79 +1996,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; 
X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %sil -; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rsi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al +; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al +; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rsi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r8, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: 
shl_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax @@ -2226,50 +2067,40 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %sil -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %sil, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r10, %rbx, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes: @@ -2278,118 +2109,112 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al +; X86-NO-BMI2-NO-SHLD-NEXT: negb %al +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
72(%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2398,7 +2223,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2413,99 +2238,70 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ebp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ebp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ebp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ebp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $28, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: negb %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), 
%ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -2519,106 +2315,105 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %ebx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2631,95 +2426,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $28, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%esi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movsbl %al, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %eax, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, 
%ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%esi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2735,36 +2510,36 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq 
%rax, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi @@ -2773,145 +2548,130 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; 
X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl 
%sil, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rsi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rsi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb 
$63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %sil -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %sil, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r10d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r10b -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r10, %rbx, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax 
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: @@ -2920,17 +2680,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $108, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx @@ -2942,7 +2702,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ 
-2953,95 +2713,94 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al -; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebp,4), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: 
addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebp,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%ebx,4), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ebx,4), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx 
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3088,64 +2847,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $5, %al +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp @@ -3161,106 +2897,101 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $108, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %edi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, 
%ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3273,93 +3004,79 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $92, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %edx -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $5, %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebp,4), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp,4), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp,4), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp,4), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp,4), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp,4), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -3381,6 +3098,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3390,6 +3108,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) @@ -3398,18 +3121,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; 
X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi @@ -3417,7 +3132,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx @@ -3426,7 +3140,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 @@ -3478,6 +3192,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3488,22 +3203,24 @@ define void @lshr_64bytes(ptr 
%src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -3511,73 +3228,41 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
%r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, (%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
-80(%rsp,%rdi), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes: @@ -3588,6 +3273,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3597,6 +3283,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) @@ -3606,52 +3297,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, 
%r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 @@ -3662,10 +3344,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3676,11 +3359,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 @@ -3691,6 +3371,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) @@ -3700,60 +3385,39 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r12, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes: @@ -3762,40 +3426,44 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: 
movl 52(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3806,8 +3474,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3816,214 +3483,199 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: 
movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: 
movl 92(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: 
leal (%edx,%edx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte 
Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 
%ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: 
addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -4036,209 +3688,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, 
{{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
116(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
%esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: 
movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -4252,42 +3848,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -4297,6 +3897,7 @@ define void @lshr_64bytes(ptr 
%src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4307,163 +3908,141 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, 
%ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte 
Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -4478,7 +4057,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ 
-4489,7 +4068,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx @@ -4499,7 +4078,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi @@ -4508,13 +4087,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
%ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -4522,9 +4105,10 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -4534,138 +4118,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %ecx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %esi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl 
killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -4680,7 +4216,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 @@ -4695,6 +4230,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -4703,107 +4243,91 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) 
-; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14 ; 
X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 
-8(%rsp,%r14), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: 
shrq %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax @@ -4815,7 +4339,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 
-{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) @@ -4823,77 +4352,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi -; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
%r10, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes: @@ -4904,6 +4398,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
8(%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 @@ -4913,6 +4408,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -4922,68 +4422,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, 
%r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
-16(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: 
movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -4994,12 +4484,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 @@ -5009,6 +4496,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -5018,65 +4510,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, 
%r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes: @@ -5085,42 +4552,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
28(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5129,6 +4598,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5137,200 +4609,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: 
movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi 
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, 
%ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5353,7 +4804,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -5366,213 +4817,153 @@ define void @shl_64bytes(ptr %src.ptr, 
ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 160(%esp,%ebp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -5585,50 +4976,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $216, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5641,179 +5037,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %esi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 212(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 
60(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 36(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 28(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 48(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $216, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
%edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -5827,42 +5194,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr 
%bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[0-9]+}}(%esp) @@ -5870,6 +5239,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5882,148 +5254,93 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi 
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
shldl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -6045,6 +5362,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -6072,9 +5390,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax +; 
X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi @@ -6082,7 +5400,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx @@ -6091,7 +5408,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 @@ -6143,6 +5460,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6153,22 +5471,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: ; X64-NO-BMI2-HAVE-SHLD: # %bb.0: -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: 
pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) @@ -6176,74 +5491,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, (%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rdi), %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rdi), %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r14,%r14), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rdi), %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rdi), %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%rbp,%rbp), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: @@ -6254,6 +5545,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -6281,44 +5573,43 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, 
%eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: 
shlxq %r12, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 @@ -6329,10 +5620,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6343,11 +5635,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 @@ -6376,52 +5665,39 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%rbx,%rbx), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r13,%r13), %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r12, %r15, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %r12 -; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r12, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r14, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes: @@ -6430,12 +5706,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $208, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx @@ -6443,7 +5719,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: 
movl 24(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx @@ -6452,19 +5728,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -6473,7 +5749,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 
4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -6482,7 +5758,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -6503,196 +5779,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx 
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
88(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 128(%esp,%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi 
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%esi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 136(%esp,%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 140(%esp,%esi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $208, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ 
-6705,7 +5980,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -6718,7 +5993,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx @@ -6726,189 +6001,144 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 
{{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, 
%edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%esi), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edx +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -6942,199 +6172,199 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl 88(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 132(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 128(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 76(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte 
Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 136(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -7149,7 +6379,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) 
nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -7158,7 +6388,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx @@ -7170,173 +6400,142 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl 68(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 112(%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 120(%esp,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 116(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 124(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, 
%ebp, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%edx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 40(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 32(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 16(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 9ae1f270e883..044be12a3954 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -432,30 +432,89 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; -; X86-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: 
shll $3, %ecx -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movzbl (%esp,%ecx), %ecx -; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl 
+; +; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %ebx +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-SHLD-NEXT: movb %bl, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -505,30 +564,89 @@ define void @load_2byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; -; X86-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl 
(%esp,%ecx), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; 
X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movw %si, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <8 x 
i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -577,30 +695,89 @@ define void @load_4byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; -; X86-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; 
X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -649,32 +826,128 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; -; X86-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx -; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; 
X86-NO-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-NO-BMI2-NO-SHLD-NEXT: notb %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $32, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb 
$3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi +; X86-SHLD-NEXT: movl (%esp,%edx), %edi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %edx, %ebx +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-SHLD-NEXT: movl %edi, (%eax) +; X86-SHLD-NEXT: addl $32, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movdqa %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <8 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <8 x i8> %init, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <16 x i8> %intermediate.sroa.0.0.vec.expand, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> @@ -689,58 +962,123 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; 
X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movzbl -64(%rsp,%rax), %eax -; X64-NEXT: movb %al, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movzbl (%esp,%ecx), %ecx -; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; 
X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NEXT: movb %al, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax +; X64-BMI2-NEXT: movb %al, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-SHLD-NEXT: movb %bl, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> @@ -756,58 +1094,136 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr 
%dst) nounwind { -; X64-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rax -; X64-NEXT: movw %ax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: 
load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-NO-BMI2-NEXT: addl %eax, %eax +; X64-NO-BMI2-NEXT: andb $56, %cl +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NEXT: orl %esi, %eax +; X64-NO-BMI2-NEXT: movw %ax, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-BMI2-NEXT: andb $56, %sil +; X64-BMI2-NEXT: notb %sil +; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-BMI2-NEXT: addl %eax, %eax +; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movw %cx, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; 
X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, 
%edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movw %si, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> @@ -822,58 +1238,136 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movl -64(%rsp,%rax), %eax -; X64-NEXT: movl %eax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 
= xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-NO-BMI2-NEXT: addl %eax, %eax +; X64-NO-BMI2-NEXT: andb $56, %cl +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NEXT: orl %esi, %eax +; X64-NO-BMI2-NEXT: movl %eax, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; 
X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: xorps %xmm1, %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-BMI2-NEXT: andb $56, %sil +; X64-BMI2-NEXT: notb %sil +; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-BMI2-NEXT: addl %eax, %eax +; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movl %ecx, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> @@ -888,60 +1382,191 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: 
load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rax -; X64-NEXT: movq %rax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; 
X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-SHLD: # %bb.0: +; X64-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movl %ecx, %eax +; X64-SHLD-NEXT: shrb $6, %al +; X64-SHLD-NEXT: movzbl %al, %eax +; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi +; X64-SHLD-NEXT: movq %rsi, (%rdx) +; X64-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-NO-SHLD: 
# %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-NO-BMI2-NO-SHLD-NEXT: notb %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $64, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %edx, %ebx +; X86-SHLD-NEXT: shrdl %cl, 
%esi, %ebx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-SHLD-NEXT: movl %edi, (%eax) +; X86-SHLD-NEXT: addl $64, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
shlxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> @@ -956,70 +1581,288 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; 
X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rcx -; X64-NEXT: movq -56(%rsp,%rax), %rax -; X64-NEXT: movq %rax, 8(%rdx) -; X64-NEXT: movq %rcx, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %esi -; X86-NEXT: movl 8(%esp,%ecx), %edi -; X86-NEXT: movl 12(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: 
retl +; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
-56(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; 
X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi 
+; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebp +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $92, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %ebx +; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi +; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax +; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi +; 
X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi +; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp +; X86-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-SHLD-NEXT: movl %esi, 8(%edx) +; X86-SHLD-NEXT: movl %edi, 4(%edx) +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl %eax, (%edx) +; X86-SHLD-NEXT: addl $92, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: popl %ebp +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <16 x i8> %init, <16 x i8> poison, <32 x i32> <i32 0, 
i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <32 x i8> %intermediate.sroa.0.0.vec.expand, <32 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> @@ -1034,84 +1877,155 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i } define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movzbl -128(%rsp,%rsi), %eax -; X64-NEXT: movb %al, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $128, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %ecx -; X86-NEXT: movzbl (%esp,%ecx), %ecx -; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $128, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: pushq %rax +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: andl $56, %ecx +; X64-NO-BMI2-NEXT: andl $56, %esi +; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax +; X64-NO-BMI2-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-NO-BMI2-NEXT: addl %esi, %esi +; X64-NO-BMI2-NEXT: notl %ecx +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NEXT: orl %eax, %esi +; X64-NO-BMI2-NEXT: movb %sil, (%rdx) +; X64-NO-BMI2-NEXT: popq %rax +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: pushq %rax +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps 
%xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: leal (,%rsi,8), %eax +; X64-BMI2-NEXT: andl $56, %eax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-BMI2-NEXT: addl %esi, %esi +; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movb %cl, (%rdx) +; X64-BMI2-NEXT: popq %rax +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, 
%edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: subl $136, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-SHLD-NEXT: andl $60, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %ebx +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-SHLD-NEXT: movb %bl, (%eax) +; X86-SHLD-NEXT: addl $136, %esp +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, 
i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1127,84 +2041,155 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NEXT: movw %ax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $128, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: addl $128, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: pushq %rax +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: andl $56, %ecx +; X64-NO-BMI2-NEXT: andl $56, %esi +; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax +; X64-NO-BMI2-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-NO-BMI2-NEXT: addl %esi, %esi +; X64-NO-BMI2-NEXT: notl %ecx +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NEXT: orl %eax, %esi +; X64-NO-BMI2-NEXT: movw %si, (%rdx) +; X64-NO-BMI2-NEXT: popq %rax +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: pushq %rax +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps 
%xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: leal (,%rsi,8), %eax +; X64-BMI2-NEXT: andl $56, %eax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-BMI2-NEXT: addl %esi, %esi +; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movw %cx, (%rdx) +; X64-BMI2-NEXT: popq %rax +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, 
%edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $136, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-SHLD-NEXT: andl $60, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movw %si, (%eax) +; X86-SHLD-NEXT: addl $136, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, 
i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1219,84 +2204,155 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movl -128(%rsp,%rsi), %eax -; X64-NEXT: movl %eax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $128, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $128, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: pushq %rax +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: andl $56, %ecx +; X64-NO-BMI2-NEXT: andl $56, %esi +; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax +; X64-NO-BMI2-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-NO-BMI2-NEXT: addl %esi, %esi +; X64-NO-BMI2-NEXT: notl %ecx +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NEXT: orl %eax, %esi +; X64-NO-BMI2-NEXT: movl %esi, (%rdx) +; X64-NO-BMI2-NEXT: popq %rax +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: pushq %rax +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps 
%xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: leal (,%rsi,8), %eax +; X64-BMI2-NEXT: andl $56, %eax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi +; X64-BMI2-NEXT: addl %esi, %esi +; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movl %ecx, (%rdx) +; X64-BMI2-NEXT: popq %rax +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, 
%edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $136, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-SHLD-NEXT: andl $60, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, (%eax) +; X86-SHLD-NEXT: addl $136, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 
poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1311,86 +2367,216 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq 
$0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NEXT: movq %rax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: subl $128, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $128, %esp -; X86-NEXT: retl +; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-SHLD: # %bb.0: +; X64-SHLD-NEXT: pushq %rax +; X64-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps 
%xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-SHLD-NEXT: andl $56, %esi +; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax +; X64-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi +; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax +; X64-SHLD-NEXT: movq %rax, (%rdx) +; X64-SHLD-NEXT: popq %rax +; X64-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll 
%cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $140, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $128, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %esi +; X86-SHLD-NEXT: andl $60, %esi +; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi +; X86-SHLD-NEXT: movl (%esp,%esi), %edx +; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: andl $24, %ecx +; X86-SHLD-NEXT: movl %esi, %ebx +; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: addl $128, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: 
load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl 
$128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, 
i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1405,96 +2591,326 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 } define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-NEXT: movq %rcx, 8(%rdx) -; X64-NEXT: movq %rax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: subl $128, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: 
pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %esi -; X86-NEXT: movl 8(%esp,%ecx), %edi -; X86-NEXT: movl 12(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $128, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 
16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps 
%xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps 
%xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; 
X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: 
orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebp +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $156, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movl %ecx, %edi +; X86-SHLD-NEXT: andl $60, %edi +; X86-SHLD-NEXT: movl 24(%esp,%edi), %esi +; X86-SHLD-NEXT: movl 16(%esp,%edi), %eax +; X86-SHLD-NEXT: movl 20(%esp,%edi), %ebx +; X86-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: andl $24, %ecx +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-SHLD-NEXT: movl 28(%esp,%edi), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi +; X86-SHLD-NEXT: movl 
32(%esp,%edi), %edi +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: shrdl %cl, %edi, %ebp +; X86-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-SHLD-NEXT: movl %esi, 8(%edx) +; X86-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl %eax, (%edx) +; X86-SHLD-NEXT: addl $156, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: popl %ebp +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, 
i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1509,116 +2925,484 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i } define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd 
{{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: andl $63, %esi -; X64-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-NEXT: movq -112(%rsp,%rsi), %rdi -; X64-NEXT: movq -104(%rsp,%rsi), %rsi -; X64-NEXT: movq %rsi, 24(%rdx) -; X64-NEXT: movq %rdi, 16(%rdx) -; X64-NEXT: movq %rcx, 8(%rdx) -; X64-NEXT: movq %rax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: subl $136, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movdqu (%ecx), %xmm0 -; X86-NEXT: movdqu 16(%ecx), %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd 
%xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: andl $63, %eax -; X86-NEXT: movl 8(%esp,%eax), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esp,%eax), %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: movl 16(%esp,%eax), %esi -; X86-NEXT: movl 20(%esp,%eax), %edi -; X86-NEXT: movl 24(%esp,%eax), %ebx -; X86-NEXT: movl 28(%esp,%eax), %ebp -; X86-NEXT: movl 32(%esp,%eax), %edx -; X86-NEXT: movl 36(%esp,%eax), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl %edx, 24(%eax) -; X86-NEXT: movl %ebp, 20(%eax) -; X86-NEXT: movl %ebx, 16(%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $136, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X64-NO-BMI2-NO-SHLD-LABEL: 
load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: notb %r8b +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax +; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 
+; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: 
andl $56, %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r14, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %r11, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 
-96(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $172, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $24, %eax 
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $172, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebp +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $156, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movl %ecx, %edi +; X86-SHLD-NEXT: andl $60, %edi +; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx +; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax +; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: andl $24, %ecx +; X86-SHLD-NEXT: movl %eax, %esi +; X86-SHLD-NEXT: movl %edx, %eax +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: movl 28(%esp,%edi), %edx +; X86-SHLD-NEXT: shrdl %cl, %edx, %eax +; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: movl 32(%esp,%edi), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebp +; 
X86-SHLD-NEXT: movl 40(%esp,%edi), %edx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl 44(%esp,%edi), %eax +; X86-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-SHLD-NEXT: movl 16(%esp,%edi), %ebx +; X86-SHLD-NEXT: movl 48(%esp,%edi), %edi +; X86-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SHLD-NEXT: movl %eax, 28(%edi) +; X86-SHLD-NEXT: movl %edx, 24(%edi) +; X86-SHLD-NEXT: movl %esi, 20(%edi) +; X86-SHLD-NEXT: movl %ebp, 16(%edi) +; X86-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SHLD-NEXT: movl %eax, 12(%edi) +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SHLD-NEXT: movl %eax, 8(%edi) +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SHLD-NEXT: movl %eax, 4(%edi) +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-SHLD-NEXT: movl %ebx, (%edi) +; X86-SHLD-NEXT: addl $156, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: popl %ebp +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %intermediate.sroa.0.0.vecblend = shufflevector <64 x i8> %intermediate.sroa.0.0.vec.expand, <64 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -1633,9 +3417,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i } ;; NOTE: 
These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X64-HAVE-BMI2-HAVE-SHLD: {{.*}} -; X64-NO-BMI2-HAVE-SHLD: {{.*}} +; X64: {{.*}} ; X64-NO-SHLD: {{.*}} +; X86: {{.*}} ; X86-HAVE-BMI2-HAVE-SHLD: {{.*}} ; X86-NO-BMI2-HAVE-SHLD: {{.*}} ; X86-NO-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 4a47e7613dfa..ff13f4ba577f 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -603,32 +603,86 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movb %sil, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; -; X86-LABEL: load_1byte_chunk_of_16byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movzbl (%esp,%ecx), %ecx -; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %ebx +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-SHLD-NEXT: movb %bl, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: 
load_1byte_chunk_of_16byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <16 x i8> %init @@ -711,32 +765,86 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; -; X86-LABEL: load_2byte_chunk_of_16byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = 
xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $40, %esp +; 
X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movw %si, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <16 x i8> %init @@ -818,32 +926,86 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; -; X86-LABEL: load_4byte_chunk_of_16byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $40, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, (%eax) +; X86-SHLD-NEXT: addl $40, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 
(%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <16 x i8> %init @@ -925,34 +1087,125 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq ; -; X86-LABEL: load_8byte_chunk_of_16byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: 
andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $32, %esp -; X86-NEXT: retl +; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-NO-BMI2-NO-SHLD-NEXT: notb %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; 
X86-NO-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $32, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $3, %dl +; X86-SHLD-NEXT: andb $12, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi +; X86-SHLD-NEXT: movl (%esp,%edx), %edi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %edx, %ebx +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-SHLD-NEXT: movl %edi, (%eax) +; X86-SHLD-NEXT: addl $32, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx 
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <16 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <16 x i8> %init @@ -967,64 +1220,128 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) ; no @load_16byte_chunk_of_16byte_alloca define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_1byte_chunk_of_32byte_alloca: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: 
pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movzbl -64(%rsp,%rax), %eax -; X64-NEXT: movb %al, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_1byte_chunk_of_32byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movzbl (%esp,%ecx), %ecx -; X86-NEXT: movb %cl, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: 
+; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shrq %cl, %rax +; X64-NO-BMI2-NEXT: movb %al, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax +; X64-BMI2-NEXT: movb %al, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: 
movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %ebx +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %ebx +; X86-SHLD-NEXT: movb %bl, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <32 x i8> %init @@ -1038,64 +1355,141 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) } define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_2byte_chunk_of_32byte_alloca: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rax -; X64-NEXT: movw %ax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_2byte_chunk_of_32byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movw %cx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx 
+; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-NO-BMI2-NEXT: addl %eax, %eax +; X64-NO-BMI2-NEXT: andb $56, %cl +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NEXT: orl %esi, %eax +; X64-NO-BMI2-NEXT: movw %ax, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-BMI2-NEXT: andb $56, %sil +; X64-BMI2-NEXT: notb %sil +; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-BMI2-NEXT: addl %eax, %eax +; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax +; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movw %cx, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movw %dx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; 
X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movw %si, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movw %cx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <32 x i8> %init @@ -1108,64 +1502,141 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) } define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: 
load_4byte_chunk_of_32byte_alloca: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movl -64(%rsp,%rax), %eax -; X64-NEXT: movl %eax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_4byte_chunk_of_32byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb 
$3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca: +; X64-NO-BMI2: # %bb.0: +; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NEXT: shrb $6, %al +; X64-NO-BMI2-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-NO-BMI2-NEXT: addl %eax, %eax +; X64-NO-BMI2-NEXT: andb $56, %cl +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NEXT: orl %esi, %eax +; X64-NO-BMI2-NEXT: movl %eax, (%rdx) +; X64-NO-BMI2-NEXT: retq +; +; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca: +; X64-BMI2: # %bb.0: +; X64-BMI2-NEXT: movups (%rdi), %xmm0 +; X64-BMI2-NEXT: movups 16(%rdi), %xmm1 +; X64-BMI2-NEXT: shll $3, %esi +; X64-BMI2-NEXT: xorps %xmm2, %xmm2 +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-BMI2-NEXT: movl %esi, %eax +; X64-BMI2-NEXT: shrb $6, %al +; X64-BMI2-NEXT: movzbl %al, %eax +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-BMI2-NEXT: andb $56, %sil +; X64-BMI2-NEXT: notb %sil +; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-BMI2-NEXT: addl %eax, %eax +; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax +; 
X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: movl %ecx, (%rdx) +; X64-BMI2-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $72, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; 
X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl (%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %esi +; X86-SHLD-NEXT: movl %esi, (%eax) +; X86-SHLD-NEXT: addl $72, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <32 x i8> %init @@ -1178,66 +1649,197 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) } define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_8byte_chunk_of_32byte_alloca: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rax -; X64-NEXT: movq %rax, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_8byte_chunk_of_32byte_alloca: -; X86: # %bb.0: -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, 
{{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $64, %esp -; X86-NEXT: retl +; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X64-SHLD: # %bb.0: +; X64-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-SHLD-NEXT: xorps %xmm2, %xmm2 +; 
X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-SHLD-NEXT: movl %ecx, %eax +; X64-SHLD-NEXT: shrb $6, %al +; X64-SHLD-NEXT: movzbl %al, %eax +; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi +; X64-SHLD-NEXT: movq %rsi, (%rdx) +; X64-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-NO-BMI2-NO-SHLD-NEXT: notb %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $64, %esp +; X86-SHLD-NEXT: 
movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: movups (%edx), %xmm0 +; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, (%esp) +; X86-SHLD-NEXT: movl %ecx, %edx +; X86-SHLD-NEXT: shrb $5, %dl +; X86-SHLD-NEXT: movzbl %dl, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx,4), %esi +; X86-SHLD-NEXT: movl (%esp,%edx,4), %edi +; X86-SHLD-NEXT: movl 4(%esp,%edx,4), %edx +; X86-SHLD-NEXT: movl %edx, %ebx +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-SHLD-NEXT: movl %edi, (%eax) +; X86-SHLD-NEXT: addl $64, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%edx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, 
(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edx,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $76, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <32 x i8> %init @@ -1250,76 +1852,295 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) } define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind { -; X64-LABEL: load_16byte_chunk_of_32byte_alloca: -; X64: # %bb.0: -; X64-NEXT: movdqu (%rdi), %xmm0 -; X64-NEXT: movdqu 16(%rdi), %xmm1 -; X64-NEXT: shll $3, %esi -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm3, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, 
-{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrb $3, %sil -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movq -64(%rsp,%rax), %rcx -; X64-NEXT: movq -56(%rsp,%rax), %rax -; X64-NEXT: movq %rax, 8(%rdx) -; X64-NEXT: movq %rcx, (%rdx) -; X64-NEXT: retq -; -; X86-LABEL: load_16byte_chunk_of_32byte_alloca: -; X86: # %bb.0: -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; X86-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; X86-NEXT: pshufd {{.*#+}} xmm7 = xmm1[3,3,3,3] -; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm0, (%esp) -; X86-NEXT: movd %xmm7, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm6, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm5, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm4, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm3, {{[0-9]+}}(%esp) -; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movl (%esp,%ecx), %edx -; X86-NEXT: movl 4(%esp,%ecx), %esi -; X86-NEXT: movl 8(%esp,%ecx), %edi -; X86-NEXT: movl 12(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; 
X86-NEXT: addl $64, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: retl +; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X64-NO-BMI2-NO-SHLD: # %bb.0: +; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: retq +; +; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X64-NO-BMI2-HAVE-SHLD: # %bb.0: +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X64-HAVE-BMI2-NO-SHLD: # %bb.0: +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: retq +; +; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0: +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq +; +; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: subl $92, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edi,4), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi,4), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $92, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: retl +; +; X86-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X86-SHLD: # %bb.0: +; X86-SHLD-NEXT: pushl %ebp +; X86-SHLD-NEXT: pushl %ebx +; X86-SHLD-NEXT: pushl %edi +; X86-SHLD-NEXT: pushl %esi +; X86-SHLD-NEXT: subl $92, %esp +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movups (%eax), %xmm0 +; X86-SHLD-NEXT: movups 16(%eax), %xmm1 +; X86-SHLD-NEXT: shll $3, %ecx +; X86-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; 
X86-SHLD-NEXT: movl %ecx, %eax +; X86-SHLD-NEXT: shrb $5, %al +; X86-SHLD-NEXT: movzbl %al, %ebx +; X86-SHLD-NEXT: movl 24(%esp,%ebx,4), %esi +; X86-SHLD-NEXT: movl 16(%esp,%ebx,4), %eax +; X86-SHLD-NEXT: movl 20(%esp,%ebx,4), %edi +; X86-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-SHLD-NEXT: movl 28(%esp,%ebx,4), %ebp +; X86-SHLD-NEXT: shrdl %cl, %ebp, %esi +; X86-SHLD-NEXT: movl 32(%esp,%ebx,4), %ebx +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SHLD-NEXT: shrdl %cl, %ebx, %ebp +; X86-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-SHLD-NEXT: movl %esi, 8(%edx) +; X86-SHLD-NEXT: movl %edi, 4(%edx) +; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-SHLD-NEXT: movl %eax, (%edx) +; X86-SHLD-NEXT: addl $92, %esp +; X86-SHLD-NEXT: popl %esi +; X86-SHLD-NEXT: popl %edi +; X86-SHLD-NEXT: popl %ebx +; X86-SHLD-NEXT: popl %ebp +; X86-SHLD-NEXT: retl +; +; X86-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: +; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $92, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl 
%edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %byteOff.numbits = shl nuw nsw i64 %byteOff, 3 %intermediate.val.frozen = freeze <32 x i8> %init @@ -1334,7 +2155,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} +; X64: {{.*}} ; X64-NO-SHLD: {{.*}} -; X64-SHLD: {{.*}} +; X86: {{.*}} ; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} |
