summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2025-03-03 21:51:31 +0700
committerMatt Arsenault <138339+arsenm@users.noreply.github.com>2025-03-12 03:10:22 +0000
commit34a866b294a5e9a7a0fefae142468b5bc0caa19c (patch)
treef403547b8441e11e98b58fcbd9567aee811bf6e4
parent3af1561ecfa19fded69b3a8305e9c1918ad18727 (diff)
-rw-r--r--llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll1200
-rw-r--r--llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll272
2 files changed, 744 insertions, 728 deletions
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 8446c31d8792..a39bc6b66866 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -3425,641 +3425,657 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride3_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512-NEXT: vmovdqa 192(%rdi), %ymm7
-; AVX512-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23))
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX512-NEXT: vmovdqa 256(%rdi), %xmm9
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm19
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
+; AVX512-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21
; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm10
-; AVX512-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX512-NEXT: vmovdqa 304(%rdi), %xmm8
; AVX512-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm24
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX512-NEXT: vmovdqa %ymm5, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11))
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15]
-; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7]
-; AVX512-NEXT: vpshufb %xmm14, %xmm15, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa %ymm5, %ymm14
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15]
-; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
+; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25
+; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7]
-; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10))
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7]
-; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11))
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19))
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride3_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm8
; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm24
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm14
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride3_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm7
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm19
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21
; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm10
-; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm8
; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm12
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm15, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm8
; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 7672db8ca07a..3c98eba69ae5 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -17522,10 +17522,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebx
-; FALLBACK17-NEXT: andl $60, %ebx
+; FALLBACK17-NEXT: movl %ecx, %ebp
+; FALLBACK17-NEXT: andl $60, %ebp
; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: subl %ebx, %eax
+; FALLBACK17-NEXT: subl %ebp, %eax
; FALLBACK17-NEXT: movl 8(%eax), %esi
; FALLBACK17-NEXT: movl 12(%eax), %edx
; FALLBACK17-NEXT: shll $3, %ecx
@@ -17539,23 +17539,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: movl 16(%eax), %edi
; FALLBACK17-NEXT: movl 20(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebp
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: shldl %cl, %edx, %edi
; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: movl 24(%eax), %edi
; FALLBACK17-NEXT: movl 28(%eax), %edx
-; FALLBACK17-NEXT: movl %edx, %ebp
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %edx, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: shldl %cl, %esi, %edi
; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: movl 32(%eax), %edi
; FALLBACK17-NEXT: movl 36(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebp
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT: movl %esi, %ebx
+; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: shldl %cl, %edx, %edi
; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK17-NEXT: movl 40(%eax), %edx
@@ -17568,45 +17568,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK17-NEXT: movl 56(%eax), %edx
; FALLBACK17-NEXT: movl 60(%eax), %edi
; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl (%eax), %ebp
+; FALLBACK17-NEXT: movl (%eax), %ebx
; FALLBACK17-NEXT: movl 52(%eax), %esi
; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: negl %ebx
-; FALLBACK17-NEXT: movl 160(%esp,%ebx), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK17-NEXT: movl %edx, 56(%ebx)
-; FALLBACK17-NEXT: movl %edi, 60(%ebx)
+; FALLBACK17-NEXT: negl %ebp
+; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK17-NEXT: movl %edx, 56(%ebp)
+; FALLBACK17-NEXT: movl %edi, 60(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK17-NEXT: shll %cl, %ebp
+; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK17-NEXT: shll %cl, %ebx
; FALLBACK17-NEXT: shldl %cl, %eax, %esi
; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK17-NEXT: shldl %cl, %edi, %eax
-; FALLBACK17-NEXT: movl %eax, 48(%ebx)
-; FALLBACK17-NEXT: movl %esi, 52(%ebx)
+; FALLBACK17-NEXT: movl %eax, 48(%ebp)
+; FALLBACK17-NEXT: movl %esi, 52(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebx)
+; FALLBACK17-NEXT: movl %eax, 40(%ebp)
; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebx)
+; FALLBACK17-NEXT: movl %eax, 44(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebx)
+; FALLBACK17-NEXT: movl %eax, 32(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebx)
+; FALLBACK17-NEXT: movl %eax, 36(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebx)
+; FALLBACK17-NEXT: movl %eax, 24(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebx)
+; FALLBACK17-NEXT: movl %eax, 28(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebx)
+; FALLBACK17-NEXT: movl %eax, 16(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebx)
+; FALLBACK17-NEXT: movl %eax, 20(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebx)
+; FALLBACK17-NEXT: movl %eax, 8(%ebp)
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebx)
-; FALLBACK17-NEXT: movl %ebp, (%ebx)
-; FALLBACK17-NEXT: movl %edx, 4(%ebx)
+; FALLBACK17-NEXT: movl %eax, 12(%ebp)
+; FALLBACK17-NEXT: movl %ebx, (%ebp)
+; FALLBACK17-NEXT: movl %edx, 4(%ebp)
; FALLBACK17-NEXT: addl $188, %esp
; FALLBACK17-NEXT: popl %esi
; FALLBACK17-NEXT: popl %edi
@@ -18227,10 +18227,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebx
-; FALLBACK21-NEXT: andl $60, %ebx
+; FALLBACK21-NEXT: movl %ecx, %ebp
+; FALLBACK21-NEXT: andl $60, %ebp
; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: subl %ebx, %eax
+; FALLBACK21-NEXT: subl %ebp, %eax
; FALLBACK21-NEXT: movl 8(%eax), %esi
; FALLBACK21-NEXT: movl 12(%eax), %edx
; FALLBACK21-NEXT: shll $3, %ecx
@@ -18244,23 +18244,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: movl 16(%eax), %edi
; FALLBACK21-NEXT: movl 20(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebp
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: shldl %cl, %edx, %edi
; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: movl 24(%eax), %edi
; FALLBACK21-NEXT: movl 28(%eax), %edx
-; FALLBACK21-NEXT: movl %edx, %ebp
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %edx, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: shldl %cl, %esi, %edi
; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: movl 32(%eax), %edi
; FALLBACK21-NEXT: movl 36(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebp
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT: movl %esi, %ebx
+; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: shldl %cl, %edx, %edi
; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK21-NEXT: movl 40(%eax), %edx
@@ -18273,45 +18273,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK21-NEXT: movl 56(%eax), %edx
; FALLBACK21-NEXT: movl 60(%eax), %edi
; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl (%eax), %ebp
+; FALLBACK21-NEXT: movl (%eax), %ebx
; FALLBACK21-NEXT: movl 52(%eax), %esi
; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: negl %ebx
-; FALLBACK21-NEXT: movl 160(%esp,%ebx), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK21-NEXT: movl %edx, 56(%ebx)
-; FALLBACK21-NEXT: movl %edi, 60(%ebx)
+; FALLBACK21-NEXT: negl %ebp
+; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK21-NEXT: movl %edx, 56(%ebp)
+; FALLBACK21-NEXT: movl %edi, 60(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK21-NEXT: shll %cl, %ebp
+; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK21-NEXT: shll %cl, %ebx
; FALLBACK21-NEXT: shldl %cl, %eax, %esi
; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK21-NEXT: shldl %cl, %edi, %eax
-; FALLBACK21-NEXT: movl %eax, 48(%ebx)
-; FALLBACK21-NEXT: movl %esi, 52(%ebx)
+; FALLBACK21-NEXT: movl %eax, 48(%ebp)
+; FALLBACK21-NEXT: movl %esi, 52(%ebp)
; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebx)
+; FALLBACK21-NEXT: movl %eax, 40(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebx)
+; FALLBACK21-NEXT: movl %eax, 44(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebx)
+; FALLBACK21-NEXT: movl %eax, 32(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebx)
+; FALLBACK21-NEXT: movl %eax, 36(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebx)
+; FALLBACK21-NEXT: movl %eax, 24(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebx)
+; FALLBACK21-NEXT: movl %eax, 28(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebx)
+; FALLBACK21-NEXT: movl %eax, 16(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebx)
+; FALLBACK21-NEXT: movl %eax, 20(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebx)
+; FALLBACK21-NEXT: movl %eax, 8(%ebp)
; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebx)
-; FALLBACK21-NEXT: movl %ebp, (%ebx)
-; FALLBACK21-NEXT: movl %edx, 4(%ebx)
+; FALLBACK21-NEXT: movl %eax, 12(%ebp)
+; FALLBACK21-NEXT: movl %ebx, (%ebp)
+; FALLBACK21-NEXT: movl %edx, 4(%ebp)
; FALLBACK21-NEXT: addl $188, %esp
; FALLBACK21-NEXT: popl %esi
; FALLBACK21-NEXT: popl %edi
@@ -18833,10 +18833,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ecx, %ebx
-; FALLBACK25-NEXT: andl $60, %ebx
+; FALLBACK25-NEXT: movl %ecx, %ebp
+; FALLBACK25-NEXT: andl $60, %ebp
; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: subl %ebx, %eax
+; FALLBACK25-NEXT: subl %ebp, %eax
; FALLBACK25-NEXT: movl 8(%eax), %esi
; FALLBACK25-NEXT: movl 12(%eax), %edx
; FALLBACK25-NEXT: shll $3, %ecx
@@ -18850,23 +18850,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: movl 16(%eax), %edi
; FALLBACK25-NEXT: movl 20(%eax), %esi
-; FALLBACK25-NEXT: movl %esi, %ebp
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: shldl %cl, %edx, %edi
; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: movl 24(%eax), %edi
; FALLBACK25-NEXT: movl 28(%eax), %edx
-; FALLBACK25-NEXT: movl %edx, %ebp
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %edx, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: shldl %cl, %esi, %edi
; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: movl 32(%eax), %edi
; FALLBACK25-NEXT: movl 36(%eax), %esi
-; FALLBACK25-NEXT: movl %esi, %ebp
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT: movl %esi, %ebx
+; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: shldl %cl, %edx, %edi
; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK25-NEXT: movl 40(%eax), %edx
@@ -18879,45 +18879,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK25-NEXT: movl 56(%eax), %edx
; FALLBACK25-NEXT: movl 60(%eax), %edi
; FALLBACK25-NEXT: shldl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl (%eax), %ebp
+; FALLBACK25-NEXT: movl (%eax), %ebx
; FALLBACK25-NEXT: movl 52(%eax), %esi
; FALLBACK25-NEXT: shldl %cl, %esi, %edx
-; FALLBACK25-NEXT: negl %ebx
-; FALLBACK25-NEXT: movl 160(%esp,%ebx), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK25-NEXT: movl %edx, 56(%ebx)
-; FALLBACK25-NEXT: movl %edi, 60(%ebx)
+; FALLBACK25-NEXT: negl %ebp
+; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK25-NEXT: movl %edx, 56(%ebp)
+; FALLBACK25-NEXT: movl %edi, 60(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK25-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK25-NEXT: shll %cl, %ebp
+; FALLBACK25-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK25-NEXT: shll %cl, %ebx
; FALLBACK25-NEXT: shldl %cl, %eax, %esi
; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK25-NEXT: shldl %cl, %edi, %eax
-; FALLBACK25-NEXT: movl %eax, 48(%ebx)
-; FALLBACK25-NEXT: movl %esi, 52(%ebx)
+; FALLBACK25-NEXT: movl %eax, 48(%ebp)
+; FALLBACK25-NEXT: movl %esi, 52(%ebp)
; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 40(%ebx)
+; FALLBACK25-NEXT: movl %eax, 40(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 44(%ebx)
+; FALLBACK25-NEXT: movl %eax, 44(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 32(%ebx)
+; FALLBACK25-NEXT: movl %eax, 32(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 36(%ebx)
+; FALLBACK25-NEXT: movl %eax, 36(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 24(%ebx)
+; FALLBACK25-NEXT: movl %eax, 24(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 28(%ebx)
+; FALLBACK25-NEXT: movl %eax, 28(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebx)
+; FALLBACK25-NEXT: movl %eax, 16(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebx)
+; FALLBACK25-NEXT: movl %eax, 20(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebx)
+; FALLBACK25-NEXT: movl %eax, 8(%ebp)
; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebx)
-; FALLBACK25-NEXT: movl %ebp, (%ebx)
-; FALLBACK25-NEXT: movl %edx, 4(%ebx)
+; FALLBACK25-NEXT: movl %eax, 12(%ebp)
+; FALLBACK25-NEXT: movl %ebx, (%ebp)
+; FALLBACK25-NEXT: movl %edx, 4(%ebp)
; FALLBACK25-NEXT: addl $188, %esp
; FALLBACK25-NEXT: popl %esi
; FALLBACK25-NEXT: popl %edi
@@ -19424,10 +19424,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ecx, %ebx
-; FALLBACK29-NEXT: andl $60, %ebx
+; FALLBACK29-NEXT: movl %ecx, %ebp
+; FALLBACK29-NEXT: andl $60, %ebp
; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: subl %ebx, %eax
+; FALLBACK29-NEXT: subl %ebp, %eax
; FALLBACK29-NEXT: movl 8(%eax), %esi
; FALLBACK29-NEXT: movl 12(%eax), %edx
; FALLBACK29-NEXT: shll $3, %ecx
@@ -19441,23 +19441,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: movl 16(%eax), %edi
; FALLBACK29-NEXT: movl 20(%eax), %esi
-; FALLBACK29-NEXT: movl %esi, %ebp
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: shldl %cl, %edx, %edi
; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: movl 24(%eax), %edi
; FALLBACK29-NEXT: movl 28(%eax), %edx
-; FALLBACK29-NEXT: movl %edx, %ebp
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %edx, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: shldl %cl, %esi, %edi
; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: movl 32(%eax), %edi
; FALLBACK29-NEXT: movl 36(%eax), %esi
-; FALLBACK29-NEXT: movl %esi, %ebp
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT: movl %esi, %ebx
+; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
+; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: shldl %cl, %edx, %edi
; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK29-NEXT: movl 40(%eax), %edx
@@ -19470,45 +19470,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK29-NEXT: movl 56(%eax), %edx
; FALLBACK29-NEXT: movl 60(%eax), %edi
; FALLBACK29-NEXT: shldl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl (%eax), %ebp
+; FALLBACK29-NEXT: movl (%eax), %ebx
; FALLBACK29-NEXT: movl 52(%eax), %esi
; FALLBACK29-NEXT: shldl %cl, %esi, %edx
-; FALLBACK29-NEXT: negl %ebx
-; FALLBACK29-NEXT: movl 160(%esp,%ebx), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK29-NEXT: movl %edx, 56(%ebx)
-; FALLBACK29-NEXT: movl %edi, 60(%ebx)
+; FALLBACK29-NEXT: negl %ebp
+; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax
+; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK29-NEXT: movl %edx, 56(%ebp)
+; FALLBACK29-NEXT: movl %edi, 60(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK29-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK29-NEXT: shll %cl, %ebp
+; FALLBACK29-NEXT: shldl %cl, %ebx, %edx
+; FALLBACK29-NEXT: shll %cl, %ebx
; FALLBACK29-NEXT: shldl %cl, %eax, %esi
; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; FALLBACK29-NEXT: shldl %cl, %edi, %eax
-; FALLBACK29-NEXT: movl %eax, 48(%ebx)
-; FALLBACK29-NEXT: movl %esi, 52(%ebx)
+; FALLBACK29-NEXT: movl %eax, 48(%ebp)
+; FALLBACK29-NEXT: movl %esi, 52(%ebp)
; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 40(%ebx)
+; FALLBACK29-NEXT: movl %eax, 40(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 44(%ebx)
+; FALLBACK29-NEXT: movl %eax, 44(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 32(%ebx)
+; FALLBACK29-NEXT: movl %eax, 32(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 36(%ebx)
+; FALLBACK29-NEXT: movl %eax, 36(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 24(%ebx)
+; FALLBACK29-NEXT: movl %eax, 24(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 28(%ebx)
+; FALLBACK29-NEXT: movl %eax, 28(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebx)
+; FALLBACK29-NEXT: movl %eax, 16(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebx)
+; FALLBACK29-NEXT: movl %eax, 20(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebx)
+; FALLBACK29-NEXT: movl %eax, 8(%ebp)
; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebx)
-; FALLBACK29-NEXT: movl %ebp, (%ebx)
-; FALLBACK29-NEXT: movl %edx, 4(%ebx)
+; FALLBACK29-NEXT: movl %eax, 12(%ebp)
+; FALLBACK29-NEXT: movl %ebx, (%ebp)
+; FALLBACK29-NEXT: movl %edx, 4(%ebp)
; FALLBACK29-NEXT: addl $188, %esp
; FALLBACK29-NEXT: popl %esi
; FALLBACK29-NEXT: popl %edi