diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2025-03-03 21:51:31 +0700 |
|---|---|---|
| committer | Matt Arsenault <138339+arsenm@users.noreply.github.com> | 2025-03-12 03:10:22 +0000 |
| commit | 34a866b294a5e9a7a0fefae142468b5bc0caa19c (patch) | |
| tree | f403547b8441e11e98b58fcbd9567aee811bf6e4 | |
| parent | 3af1561ecfa19fded69b3a8305e9c1918ad18727 (diff) | |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll | 1200 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll | 272 |
2 files changed, 744 insertions, 728 deletions
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 8446c31d8792..a39bc6b66866 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -3425,641 +3425,657 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm10 -; AVX512-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16 -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11)) -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15] -; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] -; AVX512-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] -; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] -; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm10 -; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm23 ^ (ymm0 & (ymm7 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm22 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7],ymm2[8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm10 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6],ymm13[7],ymm3[8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13,14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm15, %xmm14 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm17 ^ ymm3)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (ymm1 & (ymm23 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm20 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm17 ^ (ymm1 & (ymm3 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm0[2],xmm14[3,4],xmm0[5],xmm14[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm21 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm22 ^ (ymm1 & (ymm19 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm5 & (ymm3 ^ ymm17)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3],xmm14[4],xmm0[5,6],xmm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm7 ^ (ymm5 & (ymm23 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm11 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm11 ^ (ymm15 & (ymm10 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm22 ^ ymm19)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 7672db8ca07a..3c98eba69ae5 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -17522,10 +17522,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebx -; FALLBACK17-NEXT: andl $60, %ebx +; FALLBACK17-NEXT: movl %ecx, %ebp +; FALLBACK17-NEXT: andl $60, %ebp ; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK17-NEXT: subl %ebx, %eax +; FALLBACK17-NEXT: subl %ebp, %eax ; FALLBACK17-NEXT: movl 8(%eax), %esi ; FALLBACK17-NEXT: movl 12(%eax), %edx ; FALLBACK17-NEXT: shll $3, %ecx @@ -17539,23 +17539,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%eax), %edi ; FALLBACK17-NEXT: movl 20(%eax), %esi -; FALLBACK17-NEXT: movl %esi, %ebp -; FALLBACK17-NEXT: shldl %cl, %edi, %ebp -; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %esi, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 24(%eax), %edi ; FALLBACK17-NEXT: movl 28(%eax), %edx -; FALLBACK17-NEXT: movl %edx, %ebp -; FALLBACK17-NEXT: shldl %cl, %edi, %ebp -; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%eax), %edi ; FALLBACK17-NEXT: movl 36(%eax), %esi -; FALLBACK17-NEXT: movl %esi, %ebp -; FALLBACK17-NEXT: shldl %cl, %edi, %ebp -; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %esi, %ebx +; FALLBACK17-NEXT: shldl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%eax), %edx @@ -17568,45 +17568,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl 56(%eax), %edx ; FALLBACK17-NEXT: movl 60(%eax), %edi ; FALLBACK17-NEXT: shldl %cl, %edx, %edi -; FALLBACK17-NEXT: movl (%eax), %ebp +; FALLBACK17-NEXT: movl (%eax), %ebx ; FALLBACK17-NEXT: movl 52(%eax), %esi ; FALLBACK17-NEXT: shldl %cl, %esi, %edx -; FALLBACK17-NEXT: negl %ebx -; FALLBACK17-NEXT: movl 160(%esp,%ebx), %eax -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK17-NEXT: movl %edx, 56(%ebx) -; FALLBACK17-NEXT: movl %edi, 60(%ebx) +; FALLBACK17-NEXT: negl %ebp +; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK17-NEXT: movl %edx, 56(%ebp) +; FALLBACK17-NEXT: movl %edi, 60(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK17-NEXT: shldl %cl, %ebp, %edx -; FALLBACK17-NEXT: shll %cl, %ebp +; FALLBACK17-NEXT: shldl %cl, %ebx, %edx +; FALLBACK17-NEXT: shll %cl, %ebx ; FALLBACK17-NEXT: shldl %cl, %eax, %esi ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %edi, %eax -; FALLBACK17-NEXT: movl %eax, 48(%ebx) -; FALLBACK17-NEXT: movl %esi, 52(%ebx) +; FALLBACK17-NEXT: movl %eax, 48(%ebp) +; FALLBACK17-NEXT: movl %esi, 52(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebx) +; FALLBACK17-NEXT: movl %eax, 40(%ebp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebx) +; FALLBACK17-NEXT: movl %eax, 44(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebx) +; FALLBACK17-NEXT: movl %eax, 32(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebx) +; FALLBACK17-NEXT: movl %eax, 36(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebx) +; FALLBACK17-NEXT: movl %eax, 24(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebx) +; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebx) +; FALLBACK17-NEXT: movl %eax, 16(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebx) +; FALLBACK17-NEXT: movl %eax, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebx) +; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebx) -; FALLBACK17-NEXT: movl %ebp, (%ebx) -; FALLBACK17-NEXT: movl %edx, 4(%ebx) +; FALLBACK17-NEXT: movl %eax, 12(%ebp) +; FALLBACK17-NEXT: movl %ebx, (%ebp) +; FALLBACK17-NEXT: movl %edx, 4(%ebp) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -18227,10 +18227,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebx -; FALLBACK21-NEXT: andl $60, %ebx +; FALLBACK21-NEXT: movl %ecx, %ebp +; FALLBACK21-NEXT: andl $60, %ebp ; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK21-NEXT: subl %ebx, %eax +; FALLBACK21-NEXT: subl %ebp, %eax ; FALLBACK21-NEXT: movl 8(%eax), %esi ; FALLBACK21-NEXT: movl 12(%eax), %edx ; FALLBACK21-NEXT: shll $3, %ecx @@ -18244,23 +18244,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 16(%eax), %edi ; FALLBACK21-NEXT: movl 20(%eax), %esi -; FALLBACK21-NEXT: movl %esi, %ebp -; FALLBACK21-NEXT: shldl %cl, %edi, %ebp -; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %esi, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 24(%eax), %edi ; FALLBACK21-NEXT: movl 28(%eax), %edx -; FALLBACK21-NEXT: movl %edx, %ebp -; FALLBACK21-NEXT: shldl %cl, %edi, %ebp -; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 32(%eax), %edi ; FALLBACK21-NEXT: movl 36(%eax), %esi -; FALLBACK21-NEXT: movl %esi, %ebp -; FALLBACK21-NEXT: shldl %cl, %edi, %ebp -; FALLBACK21-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %esi, %ebx +; FALLBACK21-NEXT: shldl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 40(%eax), %edx @@ -18273,45 +18273,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl 56(%eax), %edx ; FALLBACK21-NEXT: movl 60(%eax), %edi ; FALLBACK21-NEXT: shldl %cl, %edx, %edi -; FALLBACK21-NEXT: movl (%eax), %ebp +; FALLBACK21-NEXT: movl (%eax), %ebx ; FALLBACK21-NEXT: movl 52(%eax), %esi ; FALLBACK21-NEXT: shldl %cl, %esi, %edx -; FALLBACK21-NEXT: negl %ebx -; FALLBACK21-NEXT: movl 160(%esp,%ebx), %eax -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK21-NEXT: movl %edx, 56(%ebx) -; FALLBACK21-NEXT: movl %edi, 60(%ebx) +; FALLBACK21-NEXT: negl %ebp +; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK21-NEXT: movl %edx, 56(%ebp) +; FALLBACK21-NEXT: movl %edi, 60(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK21-NEXT: shldl %cl, %ebp, %edx -; FALLBACK21-NEXT: shll %cl, %ebp +; FALLBACK21-NEXT: shldl %cl, %ebx, %edx +; FALLBACK21-NEXT: shll %cl, %ebx ; FALLBACK21-NEXT: shldl %cl, %eax, %esi ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK21-NEXT: shldl %cl, %edi, %eax -; FALLBACK21-NEXT: movl %eax, 48(%ebx) -; FALLBACK21-NEXT: movl %esi, 52(%ebx) +; FALLBACK21-NEXT: movl %eax, 48(%ebp) +; FALLBACK21-NEXT: movl %esi, 52(%ebp) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebx) +; FALLBACK21-NEXT: movl %eax, 40(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebx) +; FALLBACK21-NEXT: movl %eax, 44(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebx) +; FALLBACK21-NEXT: movl %eax, 32(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebx) +; FALLBACK21-NEXT: movl %eax, 36(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebx) +; FALLBACK21-NEXT: movl %eax, 24(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebx) +; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebx) +; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebx) +; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebx) +; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebx) -; FALLBACK21-NEXT: movl %ebp, (%ebx) -; FALLBACK21-NEXT: movl %edx, 4(%ebx) +; FALLBACK21-NEXT: movl %eax, 12(%ebp) +; FALLBACK21-NEXT: movl %ebx, (%ebp) +; FALLBACK21-NEXT: movl %edx, 4(%ebp) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -18833,10 +18833,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebx -; FALLBACK25-NEXT: andl $60, %ebx +; FALLBACK25-NEXT: movl %ecx, %ebp +; FALLBACK25-NEXT: andl $60, %ebp ; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK25-NEXT: subl %ebx, %eax +; FALLBACK25-NEXT: subl %ebp, %eax ; FALLBACK25-NEXT: movl 8(%eax), %esi ; FALLBACK25-NEXT: movl 12(%eax), %edx ; FALLBACK25-NEXT: shll $3, %ecx @@ -18850,23 +18850,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 16(%eax), %edi ; FALLBACK25-NEXT: movl 20(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebp -; FALLBACK25-NEXT: shldl %cl, %edi, %ebp -; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %esi, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 24(%eax), %edi ; FALLBACK25-NEXT: movl 28(%eax), %edx -; FALLBACK25-NEXT: movl %edx, %ebp -; FALLBACK25-NEXT: shldl %cl, %edi, %ebp -; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 32(%eax), %edi ; FALLBACK25-NEXT: movl 36(%eax), %esi -; FALLBACK25-NEXT: movl %esi, %ebp -; FALLBACK25-NEXT: shldl %cl, %edi, %ebp -; FALLBACK25-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %esi, %ebx +; FALLBACK25-NEXT: shldl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 40(%eax), %edx @@ -18879,45 +18879,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl 56(%eax), %edx ; FALLBACK25-NEXT: movl 60(%eax), %edi ; FALLBACK25-NEXT: shldl %cl, %edx, %edi -; FALLBACK25-NEXT: movl (%eax), %ebp +; FALLBACK25-NEXT: movl (%eax), %ebx ; FALLBACK25-NEXT: movl 52(%eax), %esi ; FALLBACK25-NEXT: shldl %cl, %esi, %edx -; FALLBACK25-NEXT: negl %ebx -; FALLBACK25-NEXT: movl 160(%esp,%ebx), %eax -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK25-NEXT: movl %edx, 56(%ebx) -; FALLBACK25-NEXT: movl %edi, 60(%ebx) +; FALLBACK25-NEXT: negl %ebp +; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK25-NEXT: movl %edx, 56(%ebp) +; FALLBACK25-NEXT: movl %edi, 60(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK25-NEXT: shldl %cl, %ebp, %edx -; FALLBACK25-NEXT: shll %cl, %ebp +; FALLBACK25-NEXT: shldl %cl, %ebx, %edx +; FALLBACK25-NEXT: shll %cl, %ebx ; FALLBACK25-NEXT: shldl %cl, %eax, %esi ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK25-NEXT: shldl %cl, %edi, %eax -; FALLBACK25-NEXT: movl %eax, 48(%ebx) -; FALLBACK25-NEXT: movl %esi, 52(%ebx) +; FALLBACK25-NEXT: movl %eax, 48(%ebp) +; FALLBACK25-NEXT: movl %esi, 52(%ebp) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebx) +; FALLBACK25-NEXT: movl %eax, 40(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebx) +; FALLBACK25-NEXT: movl %eax, 44(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebx) +; FALLBACK25-NEXT: movl %eax, 32(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebx) +; FALLBACK25-NEXT: movl %eax, 36(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebx) +; FALLBACK25-NEXT: movl %eax, 24(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebx) +; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebx) +; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebx) +; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebx) +; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebx) -; FALLBACK25-NEXT: movl %ebp, (%ebx) -; FALLBACK25-NEXT: movl %edx, 4(%ebx) +; FALLBACK25-NEXT: movl %eax, 12(%ebp) +; FALLBACK25-NEXT: movl %ebx, (%ebp) +; FALLBACK25-NEXT: movl %edx, 4(%ebp) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -19424,10 +19424,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebx -; FALLBACK29-NEXT: andl $60, %ebx +; FALLBACK29-NEXT: movl %ecx, %ebp +; FALLBACK29-NEXT: andl $60, %ebp ; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK29-NEXT: subl %ebx, %eax +; FALLBACK29-NEXT: subl %ebp, %eax ; FALLBACK29-NEXT: movl 8(%eax), %esi ; FALLBACK29-NEXT: movl 12(%eax), %edx ; FALLBACK29-NEXT: shll $3, %ecx @@ -19441,23 +19441,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 16(%eax), %edi ; FALLBACK29-NEXT: movl 20(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebp -; FALLBACK29-NEXT: shldl %cl, %edi, %ebp -; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %esi, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 24(%eax), %edi ; FALLBACK29-NEXT: movl 28(%eax), %edx -; FALLBACK29-NEXT: movl %edx, %ebp -; FALLBACK29-NEXT: shldl %cl, %edi, %ebp -; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 32(%eax), %edi ; FALLBACK29-NEXT: movl 36(%eax), %esi -; FALLBACK29-NEXT: movl %esi, %ebp -; FALLBACK29-NEXT: shldl %cl, %edi, %ebp -; FALLBACK29-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %esi, %ebx +; FALLBACK29-NEXT: shldl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 40(%eax), %edx @@ -19470,45 +19470,45 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl 56(%eax), %edx ; FALLBACK29-NEXT: movl 60(%eax), %edi ; FALLBACK29-NEXT: shldl %cl, %edx, %edi -; FALLBACK29-NEXT: movl (%eax), %ebp +; FALLBACK29-NEXT: movl (%eax), %ebx ; FALLBACK29-NEXT: movl 52(%eax), %esi ; FALLBACK29-NEXT: shldl %cl, %esi, %edx -; FALLBACK29-NEXT: negl %ebx -; FALLBACK29-NEXT: movl 160(%esp,%ebx), %eax -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebx -; FALLBACK29-NEXT: movl %edx, 56(%ebx) -; FALLBACK29-NEXT: movl %edi, 60(%ebx) +; FALLBACK29-NEXT: negl %ebp +; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK29-NEXT: movl %edx, 56(%ebp) +; FALLBACK29-NEXT: movl %edi, 60(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK29-NEXT: shldl %cl, %ebp, %edx -; FALLBACK29-NEXT: shll %cl, %ebp +; FALLBACK29-NEXT: shldl %cl, %ebx, %edx +; FALLBACK29-NEXT: shll %cl, %ebx ; FALLBACK29-NEXT: shldl %cl, %eax, %esi ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK29-NEXT: shldl %cl, %edi, %eax -; FALLBACK29-NEXT: movl %eax, 48(%ebx) -; FALLBACK29-NEXT: movl %esi, 52(%ebx) +; FALLBACK29-NEXT: movl %eax, 48(%ebp) +; FALLBACK29-NEXT: movl %esi, 52(%ebp) ; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebx) +; FALLBACK29-NEXT: movl %eax, 40(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebx) +; FALLBACK29-NEXT: movl %eax, 44(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebx) +; FALLBACK29-NEXT: movl %eax, 32(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebx) +; FALLBACK29-NEXT: movl %eax, 36(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebx) +; FALLBACK29-NEXT: movl %eax, 24(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebx) +; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebx) +; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebx) +; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebx) +; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebx) -; FALLBACK29-NEXT: movl %ebp, (%ebx) -; FALLBACK29-NEXT: movl %edx, 4(%ebx) +; FALLBACK29-NEXT: movl %eax, 12(%ebp) +; FALLBACK29-NEXT: movl %ebx, (%ebp) +; FALLBACK29-NEXT: movl %edx, 4(%ebp) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi |
