Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format

author: Mingming Liu <mingmingl@google.com> 2025-09-10 15:25:31 -0700
committer: GitHub <noreply@github.com> 2025-09-10 15:25:31 -0700
commit: 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree: 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/CodeGen/X86
parent: 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent: b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)
107 files changed, 4894 insertions, 1410 deletions
diff --git a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 8d690ba06e3b..654169377609 100644
--- a/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -13,25 +13,24 @@ define fastcc void @mp_sqrt(i32 %n, i32 %radix, ptr %in, ptr %out, ptr %tmp1, pt
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    pushl %eax
-; CHECK-NEXT:    movb $1, %cl
+; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    movl $1, %ebx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: # %bb.i5
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    addl %ebx, %ebx
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb $1, %cl
 ; CHECK-NEXT:    jne .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %mp_unexp_mp2d.exit.i
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.5: # %cond_next.i
-; CHECK-NEXT:    testb $1, %al
 ; CHECK-NEXT:    jne .LBB0_3
 ; CHECK-NEXT:  # %bb.6: # %cond_next36.i
 ; CHECK-NEXT:    movl $0, 0
-; CHECK-NEXT:    movzbl %al, %ebp
+; CHECK-NEXT:    movzbl %cl, %ebp
 ; CHECK-NEXT:    andl $1, %ebp
 ; CHECK-NEXT:    xorpd %xmm0, %xmm0
 ; CHECK-NEXT:    xorl %eax, %eax
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index bf939c413108..3913e93b83a6 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -38,7 +38,6 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:  ## %bb.1: ## %bb116.i
 ; CHECK-NEXT:    je LBB0_25
 ; CHECK-NEXT:  ## %bb.2: ## %bb52.i.i
-; CHECK-NEXT:    testb $1, %bl
 ; CHECK-NEXT:    je LBB0_25
 ; CHECK-NEXT:  ## %bb.3: ## %bb142.i
 ; CHECK-NEXT:    je LBB0_25
@@ -49,23 +48,23 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    jmp LBB0_5
 ; CHECK-NEXT:  LBB0_21: ## %bb7806
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp16:
+; CHECK-NEXT:  Ltmp16: ## EH_LABEL
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll __ZN12wxStringBase6appendEmw
-; CHECK-NEXT:  Ltmp17:
+; CHECK-NEXT:  Ltmp17: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_5: ## %bb3261
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpl $37, 0
 ; CHECK-NEXT:    jne LBB0_25
 ; CHECK-NEXT:  ## %bb.6: ## %bb3306
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp0:
+; CHECK-NEXT:  Ltmp0: ## EH_LABEL
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll __ZN12wxStringBaseaSEPKw
-; CHECK-NEXT:  Ltmp1:
+; CHECK-NEXT:  Ltmp1: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.7: ## %bb3314
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    movl 0, %eax
@@ -89,11 +88,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    je LBB0_14
 ; CHECK-NEXT:  ## %bb.13: ## %bb155.i8541
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp4:
+; CHECK-NEXT:  Ltmp4: ## EH_LABEL
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll _gmtime_r
-; CHECK-NEXT:  Ltmp5:
+; CHECK-NEXT:  Ltmp5: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_14: ## %bb182.i8560
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    testb $1, %bl
@@ -103,7 +102,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    je LBB0_18
 ; CHECK-NEXT:  ## %bb.17: ## %bb440.i8663
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp6:
+; CHECK-NEXT:  Ltmp6: ## EH_LABEL
 ; CHECK-NEXT:    movl L_.str4$non_lazy_ptr, %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl L_.str33$non_lazy_ptr, %eax
@@ -113,47 +112,47 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    movl %ebp, (%esp)
 ; CHECK-NEXT:    movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5
 ; CHECK-NEXT:    calll __Z10wxOnAssertPKwiPKcS0_S0_
-; CHECK-NEXT:  Ltmp7:
+; CHECK-NEXT:  Ltmp7: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_18
 ; CHECK-NEXT:  LBB0_15: ## %bb187.i8591
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    jne LBB0_25
 ; CHECK-NEXT:  LBB0_18: ## %invcont5814
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp8:
+; CHECK-NEXT:  Ltmp8: ## EH_LABEL
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:  Ltmp9:
+; CHECK-NEXT:  Ltmp9: ## EH_LABEL
 ; CHECK-NEXT:  ## %bb.19: ## %invcont5831
 ; CHECK-NEXT:    ## in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:  Ltmp10:
+; CHECK-NEXT:  Ltmp10: ## EH_LABEL
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll __ZN12wxStringBase10ConcatSelfEmPKwm
-; CHECK-NEXT:  Ltmp11:
+; CHECK-NEXT:  Ltmp11: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_5
 ; CHECK-NEXT:  LBB0_9: ## %bb5657
-; CHECK-NEXT:  Ltmp13:
+; CHECK-NEXT:  Ltmp13: ## EH_LABEL
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
-; CHECK-NEXT:  Ltmp14:
+; CHECK-NEXT:  Ltmp14: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  LBB0_20: ## %bb5968
-; CHECK-NEXT:  Ltmp2:
+; CHECK-NEXT:  Ltmp2: ## EH_LABEL
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT:    subl $4, %esp
-; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:  Ltmp3: ## EH_LABEL
 ; CHECK-NEXT:  LBB0_25: ## %bb115.critedge.i
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    addl $28, %esp
@@ -163,13 +162,13 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl $4
 ; CHECK-NEXT:  LBB0_23: ## %lpad.loopexit.split-lp
-; CHECK-NEXT:  Ltmp15:
+; CHECK-NEXT:  Ltmp15: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  LBB0_24: ## %lpad8185
-; CHECK-NEXT:  Ltmp12:
+; CHECK-NEXT:  Ltmp12: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  LBB0_22: ## %lpad.loopexit
-; CHECK-NEXT:  Ltmp18:
+; CHECK-NEXT:  Ltmp18: ## EH_LABEL
 ; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  Lfunc_end0:
 entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index 320c96535abb..2bda8db04029 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -139,12 +139,12 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; O0-NEXT:    callq foo
 ; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
 ; O0-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
+; O0-NEXT:    movl $32, %esi
+; O0-NEXT:    movl $buf+2048, %edx
 ; O0-NEXT:    # implicit-def: $al
 ; O0-NEXT:    movb %al, {{[0-9]+}}(%rsp)
 ; O0-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
 ; O0-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; O0-NEXT:    movl $32, %esi
-; O0-NEXT:    movl $buf+2048, %edx
 ; O0-NEXT:    tileloadd (%rdx,%rsi), %tmm0
 ; O0-NEXT:    movl $64, %esi
 ; O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
diff --git a/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir b/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir
new file mode 100644
index 000000000000..82049dce8a45
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-sink-config-after-calls.mir
@@ -0,0 +1,152 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-- -mattr=+amx-int8,avx512f -run-pass="fastpretileconfig,regallocfast,fasttileconfig" -verify-machineinstrs -o - %s | FileCheck %s
+
+# Test to verify that ldtilecfg instructions are sinked closer to tile defining
+# instructions after a call. This ensures call does not overwrite values in
+# registers being used for configuring the AMX tile.
+
+...
+---
+name:            test_api
+alignment:       16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 1, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 2, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 3, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 4, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 5, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 6, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 9, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 10, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 13, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 14, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 15, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 18, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 19, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 22, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 23, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 24, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 25, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 26, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 29, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 30, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 33, class: tile, preferred-register: '', flags: [  ] }
+  - { id: 34, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 35, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 38, class: gr64_nosp, preferred-register: '', flags: [  ] }
+  - { id: 39, class: gr64, preferred-register: '', flags: [  ] }
+  - { id: 40, class: gr16, preferred-register: '', flags: [  ] }
+  - { id: 41, class: gr16, preferred-register: '', flags: [  ] }
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%2' }
+frameInfo:
+  adjustsStack:    true
+  maxAlignment:    1024
+stack:
+  - { id: 0, size: 1024, alignment: 1024 }
+  - { id: 1, size: 1024, alignment: 1024 }
+  - { id: 2, size: 32, alignment: 32 }
+  - { id: 3, size: 32, alignment: 32 }
+  - { id: 4, size: 8, alignment: 8 }
+machineFunctionInfo:
+  amxProgModel:    ManagedRA
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi
+
+    ; CHECK-LABEL: name: test_api
+    ; CHECK: liveins: $rdi, $rsi
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
+    ; CHECK-NEXT: VMOVUPSZmr %stack.5, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.5, align 4)
+    ; CHECK-NEXT: MOV8mi %stack.5, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.5, align 4)
+    ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.8)
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.2, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $cx = MOV16ri 16
+    ; CHECK-NEXT: MOV16mr %stack.7, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.7)
+    ; CHECK-NEXT: renamable $ax = MOV16ri 2
+    ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $ax :: (store (s16) into %stack.6)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rdx = MOV32ri64 16
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdi, 1, killed renamable $rdx, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    ; CHECK-NEXT: CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
+    ; CHECK-NEXT: $rsi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
+    ; CHECK-NEXT: $cx = MOV16rm %stack.7, 1, $noreg, 0, $noreg :: (load (s16) from %stack.7)
+    ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def dead $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    ; CHECK-NEXT: renamable $rdx = COPY $rax
+    ; CHECK-NEXT: $ax = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
+    ; CHECK-NEXT: MOV64mr killed renamable $rsi, 1, $noreg, 0, $noreg, killed renamable $rdx
+    ; CHECK-NEXT: renamable $rdx = MOV64rm %stack.4, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: $al = IMPLICIT_DEF
+    ; CHECK-NEXT: MOV8mr %stack.5, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.5 + 48, align 4)
+    ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.5 + 16, align 4)
+    ; CHECK-NEXT: PLDTILECFGV %stack.5, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.5, align 4)
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 64
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg
+    ; CHECK-NEXT: renamable $rsi = MOV32ri64 16
+    ; CHECK-NEXT: renamable $rdx = LEA64r %stack.4, 1, $noreg, 0, $noreg
+    ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
+    ; CHECK-NEXT: RET64
+    %2:gr64 = COPY $rsi
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY killed %0
+    %3:gr64 = COPY killed %2
+    %38:gr64_nosp = MOV32ri64 16
+    %39:gr64 = LEA64r %stack.2, 1, $noreg, 0, $noreg
+    %40:gr16 = MOV16ri 16
+    %41:gr16 = MOV16ri 2
+    %33:tile = PTILELOADDV %41:gr16, %40:gr16, killed %39, 1, killed %38, 0, $noreg
+    %34:gr64_nosp = MOV32ri64 64
+    %35:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %35, 1, killed %34, 0, $noreg, %33
+    %29:gr64_nosp = MOV32ri64 64
+    %30:gr64 = LEA64r %stack.1, 1, $noreg, 0, $noreg
+    %25:tile = PTILELOADDV %41:gr16, %40:gr16, killed %30, 1, killed %29, 0, $noreg
+    %26:gr64_nosp = MOV32ri64 16
+    PTILESTOREDV %41:gr16, %40:gr16, %1, 1, killed %26, 0, $noreg, %25
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    CALL64pcrel32 &foo, csr_64, implicit $rsp, implicit $ssp, implicit-def $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def $rsp, implicit-def $eflags, implicit-def $ssp, implicit $rsp, implicit $ssp
+    %24:gr64 = COPY $rax
+    MOV64mr %3, 1, $noreg, 0, $noreg, %24
+    %22:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg
+    %19:gr64_nosp = MOV32ri64 16
+    %13:tile = PTILELOADDV %41:gr16, %40:gr16, %22, 1, killed %19, 0, $noreg
+    %14:gr64_nosp = MOV32ri64 64
+    %15:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %15, 1, killed %14, 0, $noreg, %13
+    %9:gr64_nosp = MOV32ri64 64
+    %10:gr64 = LEA64r %stack.0, 1, $noreg, 0, $noreg
+    %4:tile = PTILELOADDV %41:gr16, %40:gr16, killed %10, 1, killed %9, 0, $noreg
+    %5:gr64_nosp = MOV32ri64 16
+    %6:gr64 = LEA64r %stack.4, 1, $noreg, 0, $noreg
+    PTILESTOREDV %41:gr16, %40:gr16, killed %6, 1, killed %5, 0, $noreg, %4
+    RET64
+...
diff --git a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
index 71f8f231747f..885bc805d655 100644
--- a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
+++ b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
 
 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
 
diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
index 8f82bd2587ec..41208d6adb30 100644
--- a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+amx-tile,+amx-avx512,+avx10.2-512 | FileCheck %s
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+amx-tile,+amx-avx512,+avx10.2 | FileCheck %s
 
 define <16 x float> @test_tcvtrowd2ps(i32 %A) {
 ; CHECK-LABEL: test_tcvtrowd2ps:
diff --git a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll
index fd3925fabc51..dc8252ae7aca 100644
--- a/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll
+++ b/llvm/test/CodeGen/X86/amx-tile-avx512-internals.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx10.2-512, \
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx10.2, \
 ; RUN: -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
 
 define void @test_amx(i8* %pointer, i8* %base, i32 %index, i64 %stride) {
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
index e52ce6ca815b..b2651e91134e 100644
--- a/llvm/test/CodeGen/X86/apx/cf.ll
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -229,3 +229,21 @@ entry:
   call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %p, i32 1, <1 x i1> %1)
   ret void
 }
+
+define i64 @redundant_test(i64 %num, ptr %p1, i64 %in) {
+; CHECK-LABEL: redundant_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl $-32, %edi
+; CHECK-NEXT:    cfcmoveq (%rsi), %rax
+; CHECK-NEXT:    {nf} addq %rdx, %rax
+; CHECK-NEXT:    cmovneq %rdi, %rax
+; CHECK-NEXT:    retq
+  %and = and i64 %num, 4294967264
+  %cmp = icmp eq i64 %and, 0
+  %mask = bitcast i1 %cmp to <1 x i1>
+  %condload = tail call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %p1, i32 8, <1 x i1> %mask, <1 x i64> poison)
+  %v = bitcast <1 x i64> %condload to i64
+  %add = add i64 %v, %in
+  %sel = select i1 %cmp, i64 %add, i64 %num
+  ret i64 %sel
+}
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
index ad24608d338a..d6d4db350910 100644
--- a/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-cfi-seh.ll
@@ -81,7 +81,7 @@ define i32 @csr6_alloc16(ptr %argv) {
 ; LIN-NEXT:    .cfi_def_cfa_offset 32
 ; LIN-NEXT:    pop2 %rbp, %r15
 ; LIN-NEXT:    .cfi_def_cfa_offset 16
-; LIN-NEXT:    popq %rcx
+; LIN-NEXT:    popq %rax
 ; LIN-NEXT:    .cfi_def_cfa_offset 8
 ; LIN-NEXT:    retq
 ;
@@ -116,7 +116,7 @@ define i32 @csr6_alloc16(ptr %argv) {
 ; LIN-PPX-NEXT:    .cfi_def_cfa_offset 32
 ; LIN-PPX-NEXT:    pop2p %rbp, %r15
 ; LIN-PPX-NEXT:    .cfi_def_cfa_offset 16
-; LIN-PPX-NEXT:    popq %rcx
+; LIN-PPX-NEXT:    popq %rax
 ; LIN-PPX-NEXT:    .cfi_def_cfa_offset 8
 ; LIN-PPX-NEXT:    retq
 ;
@@ -180,7 +180,7 @@ define i32 @csr6_alloc16(ptr %argv) {
 ; WIN-NEXT:    pop2 %rbp, %rbx
 ; WIN-NEXT:    pop2 %r13, %r12
 ; WIN-NEXT:    pop2 %r15, %r14
-; WIN-NEXT:    popq %rcx
+; WIN-NEXT:    popq %rax
 ; WIN-NEXT:    .seh_endepilogue
 ; WIN-NEXT:    retq
 ; WIN-NEXT:    .seh_endproc
@@ -211,7 +211,7 @@ define i32 @csr6_alloc16(ptr %argv) {
 ; WIN-PPX-NEXT:    pop2p %rbp, %rbx
 ; WIN-PPX-NEXT:    pop2p %r13, %r12
 ; WIN-PPX-NEXT:    pop2p %r15, %r14
-; WIN-PPX-NEXT:    popq %rcx
+; WIN-PPX-NEXT:    popq %rax
 ; WIN-PPX-NEXT:    .seh_endepilogue
 ; WIN-PPX-NEXT:    retq
 ; WIN-PPX-NEXT:    .seh_endproc
diff --git a/llvm/test/CodeGen/X86/avg-mask.ll b/llvm/test/CodeGen/X86/avg-mask.ll
index e8866393e8b6..b148cd3d42df 100644
--- a/llvm/test/CodeGen/X86/avg-mask.ll
+++ b/llvm/test/CodeGen/X86/avg-mask.ll
@@ -177,11 +177,11 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
 ; AVX512F-NEXT:    shrq $32, %rdi
 ; AVX512F-NEXT:    shrq $48, %rax
 ; AVX512F-NEXT:    shrl $16, %ecx
-; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    kmovw %ecx, %k2
 ; AVX512F-NEXT:    kmovw %eax, %k3
 ; AVX512F-NEXT:    kmovw %edi, %k4
@@ -364,11 +364,11 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    shrl $16, %edi
-; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpavgw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    kmovw %edi, %k2
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll
index ab8ac4fbd419..b43b1f7b9c32 100644
--- a/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll
+++ b/llvm/test/CodeGen/X86/avx10.2-fma-commute.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx10.2-512 | FileCheck %s
+; RUN: llc < %s --mtriple=x86_64-unknown-unknown -mattr=avx10.2 | FileCheck %s
 
 define <8 x bfloat> @fma_123_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y, <8 x bfloat> %z) {
 ; CHECK-LABEL: fma_123_v8bf16:
diff --git a/llvm/test/CodeGen/X86/avx10_2-cmp.ll b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
index 0f90f1a0a356..566ce533683f 100644
--- a/llvm/test/CodeGen/X86/avx10_2-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx10_2-cmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx10.2 | FileCheck %s --check-prefix=X86
 
 define i1 @hoeq(half %x, half %y) {
 ; X64-LABEL: hoeq:
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index c22a394e6c4e..79849a7153c9 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define <32 x bfloat> @test_int_x86_avx10_vaddbf16512(<32 x bfloat> %x1, <32 x bfloat> %x2) {
 ; CHECK-LABEL: test_int_x86_avx10_vaddbf16512:
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
index cbac76e9de27..9225bd88b089 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 declare <32 x bfloat> @llvm.x86.avx10.vminbf16512(<32 x bfloat>, <32 x bfloat>)
 
diff --git a/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll
index c4a904cc3bc4..cc87ae0aad1f 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define <32 x half> @test_int_x86_avx10_vcvt2ps2phx512(<16 x float> %A, <16 x float> %B) {
 ; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx512:
diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
index d7ad7b048c6d..c50da22193b2 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
 
 ; VCVTTPD2DQS
 define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind {
diff --git a/llvm/test/CodeGen/X86/avx10_2_512minmax-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512minmax-intrinsics.ll
index b7713128f472..c27ee1680dea 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512minmax-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512minmax-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=X86
 
 define <32 x bfloat> @test_int_x86_avx10_vminmaxbf16512(<32 x bfloat> %A, <32 x bfloat> %B) nounwind {
 ; X64-LABEL: test_int_x86_avx10_vminmaxbf16512:
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index b2e7caa15944..09eb53faaaad 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 ; VNNI FP16
 
diff --git a/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll
index 8430b2e1c028..2e69b41d282b 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512satcvt-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define dso_local <8 x i64> @test_mm512_ipcvtbf16_epi8(<32 x bfloat> noundef %__A) {
 ; CHECK-LABEL: test_mm512_ipcvtbf16_epi8:
diff --git a/llvm/test/CodeGen/X86/avx10_2_512satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512satcvtds-intrinsics.ll
index 652c35c77709..591349aabef4 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512satcvtds-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 
 define <8 x i32> @test_int_x86_mask_vcvtt_pd2dqs_512(<8 x double> %x0, <8 x i32> %src, i8 %mask) {
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 435f67a0f1e4..0f2c75b15d5b 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define <16 x bfloat> @test_int_x86_avx10_add_bf16_256(<16 x bfloat> %x1, <16 x bfloat> %x2) {
 ; CHECK-LABEL: test_int_x86_avx10_add_bf16_256:
@@ -1168,23 +1168,10 @@ entry:
 }
 
 define <32 x bfloat> @addv(<32 x bfloat> %a, <32 x bfloat> %b) nounwind {
-; X64-LABEL: addv:
-; X64:       # %bb.0:
-; X64-NEXT:    vaddbf16 %ymm2, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc2]
-; X64-NEXT:    vaddbf16 %ymm3, %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x58,0xcb]
-; X64-NEXT:    retq # encoding: [0xc3]
-;
-; X86-LABEL: addv:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp # encoding: [0x55]
-; X86-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
-; X86-NEXT:    andl $-32, %esp # encoding: [0x83,0xe4,0xe0]
-; X86-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
-; X86-NEXT:    vaddbf16 %ymm2, %ymm0, %ymm0 # encoding: [0x62,0xf5,0x7d,0x28,0x58,0xc2]
-; X86-NEXT:    vaddbf16 8(%ebp), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x58,0x8d,0x08,0x00,0x00,0x00]
-; X86-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
-; X86-NEXT:    popl %ebp # encoding: [0x5d]
-; X86-NEXT:    retl # encoding: [0xc3]
+; CHECK-LABEL: addv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vaddbf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf5,0x7d,0x48,0x58,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %add = fadd <32 x bfloat> %a, %b
   ret <32 x bfloat> %add
 }
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
index ba32b2adc799..3efc8cc3d129 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 declare <16 x bfloat> @llvm.x86.avx10.vminbf16256(<16 x bfloat>, <16 x bfloat>)
 
diff --git a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
index 90e2146cc2c0..04c93eb1ee6d 100644
--- a/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define <8 x half> @test_int_x86_avx10_vcvt2ps2phx128(<4 x float> %A, <4 x float> %B) {
 ; CHECK-LABEL: test_int_x86_avx10_vcvt2ps2phx128:
diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
index a2f167e94cc2..e0c2139b5e37 100644
--- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
+++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
 
 ;
 ; 32-bit float to signed integer
diff --git a/llvm/test/CodeGen/X86/avx10_2minmax-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2minmax-intrinsics.ll
index 916d439ab77f..8ae5b670764e 100644
--- a/llvm/test/CodeGen/X86/avx10_2minmax-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2minmax-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=X86
 
 define <8 x bfloat> @test_int_x86_avx10_vminmaxbf16128(<8 x bfloat> %A, <8 x bfloat> %B) nounwind {
 ; X64-LABEL: test_int_x86_avx10_vminmaxbf16128:
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index ed5ae01448c5..0c5fd3bf9d24 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 ; VNNI FP16
 
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll
index 957523f87b7c..094637270503 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvt-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64 --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686 --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define dso_local <2 x i64> @test_mm_ipcvtbf16_epi8(<8 x bfloat> noundef %__A) {
 ; CHECK-LABEL: test_mm_ipcvtbf16_epi8:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index e9b739074b45..38d54cff6dc2 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,X86
 
 define i32 @test_x86_avx512_vcvttsd2usis(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_vcvttsd2usis:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-x64-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-x64-intrinsics.ll
index f5be929bc85c..c853da5d2168 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-x64-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-x64-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s
 
 define i64 @test_x86_avx512_vcvttsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_vcvttsd2si64:
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
index 33819c9e0102..97ca0d88b7d4 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-arith.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,EVEX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,-evex512 | FileCheck %s --check-prefixes=CHECK,EVEX256
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK
 
 ; 256-bit
 
@@ -237,32 +236,19 @@ define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) {
 }
 
 define i16 @PR90356(<16 x i1> %a) {
-; EVEX512-LABEL: PR90356:
-; EVEX512:       # %bb.0:
-; EVEX512-NEXT:    vpsllw $7, %xmm0, %xmm0
-; EVEX512-NEXT:    vpmovb2m %xmm0, %k1
-; EVEX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; EVEX512-NEXT:    movb $63, %al
-; EVEX512-NEXT:    kmovd %eax, %k1
-; EVEX512-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
-; EVEX512-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; EVEX512-NEXT:    kmovd %k0, %eax
-; EVEX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; EVEX512-NEXT:    vzeroupper
-; EVEX512-NEXT:    retq
-;
-; EVEX256-LABEL: PR90356:
-; EVEX256:       # %bb.0:
-; EVEX256-NEXT:    vpsllw $7, %xmm0, %xmm0
-; EVEX256-NEXT:    vpmovb2m %xmm0, %k0
-; EVEX256-NEXT:    vpmovm2w %k0, %ymm0
-; EVEX256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; EVEX256-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; EVEX256-NEXT:    vpmovw2m %ymm0, %k0
-; EVEX256-NEXT:    kmovd %k0, %eax
-; EVEX256-NEXT:    # kill: def $ax killed $ax killed $eax
-; EVEX256-NEXT:    vzeroupper
-; EVEX256-NEXT:    retq
+; CHECK-LABEL: PR90356:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; CHECK-NEXT:    movb $63, %al
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = shufflevector <16 x i1> %a, <16 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31>
   %2 = bitcast <16 x i1> %1 to i16
   ret i16 %2
diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
index e449c7192e4b..b60d7a5463d6 100644
--- a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
@@ -278,14 +278,14 @@ define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4
   ret <4 x float> %res
 }
 
-define <4 x float> @PR98306() {
+define <4 x float> @PR98306(i8 %m) {
 ; CHECK-LABEL: PR98306:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kxorw %k0, %k0, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [7.8125E-3,1.050912E+6,4.203776E+6,1.6815616E+7]
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [3.2E+1,4.03288064E+8,8.0658432E+8,1.61318502E+9]
 ; CHECK-NEXT:    vfmaddcsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 0, i32 4)
+  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 %m, i32 4)
   ret <4 x float> %res
 }
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a2af7df44010..d09807e4a334 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX102
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefixes=AVX102
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx512f | FileCheck %s --check-prefixes=NOAVX512MOVZXC
 
 define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll
index 56d923d7c4cf..4a5c1fe5a2a0 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fold-load-binops.ll
@@ -57,7 +57,7 @@ define <8 x half> @minsh(<8 x half> %va, ptr %pb) {
 ; CHECK-LABEL: minsh:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vminsh (%rdi), %xmm0, %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %a = extractelement <8 x half> %va, i32 0
   %b = load half, ptr %pb
@@ -70,7 +70,7 @@ define <8 x half> @maxsh(<8 x half> %va, ptr %pb) {
 ; CHECK-LABEL: maxsh:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vminsh (%rdi), %xmm0, %xmm1
-; CHECK-NEXT:    vmovsh %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %a = extractelement <8 x half> %va, i32 0
   %b = load half, ptr %pb
diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
index 627a94799424..44ea3ce64ccf 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@@ -1361,3 +1361,19 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind
   %res = shufflevector <16 x half> %a0, <16 x half> %a1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x half> %res
 }
+
+define <8 x half> @PR153570(ptr %p) {
+; CHECK-LABEL: PR153570:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vmulsh {rn-sae}, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
+; CHECK-NEXT:    retq
+  %r = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 0, i32 8)
+  store <8 x half> %r, ptr %p, align 16
+  %r1 = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 1, i32 8)
+  ret <8 x half> %r1
+}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 526511c85045..316e3f27a0a1 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -303,7 +303,7 @@ define <8 x half> @test14(half %x) {
 ; X64-LABEL: test14:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test14:
@@ -318,7 +318,7 @@ define <16 x half> @test14b(half %x) {
 ; X64VL-LABEL: test14b:
 ; X64VL:       # %bb.0:
 ; X64VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: test14b:
@@ -329,7 +329,7 @@ define <16 x half> @test14b(half %x) {
 ; X64-NOVL-LABEL: test14b:
 ; X64-NOVL:       # %bb.0:
 ; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; X64-NOVL-NEXT:    retq
@@ -341,7 +341,7 @@ define <32 x half> @test14c(half %x) {
 ; X64VL-LABEL: test14c:
 ; X64VL:       # %bb.0:
 ; X64VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: test14c:
@@ -352,7 +352,7 @@ define <32 x half> @test14c(half %x) {
 ; X64-NOVL-LABEL: test14c:
 ; X64-NOVL:       # %bb.0:
 ; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64-NOVL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64-NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64-NOVL-NEXT:    retq
@@ -1464,21 +1464,21 @@ define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
 ; X64VL-LABEL: movsh:
 ; X64VL:       # %bb.0:
 ; X64VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; X64VL-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X64VL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64VL-NEXT:    vaddph %xmm0, %xmm2, %xmm0
 ; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: movsh:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; X86-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X86-NEXT:    vaddph %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-NOVL-LABEL: movsh:
 ; X64-NOVL:       # %bb.0:
 ; X64-NOVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
-; X64-NOVL-NEXT:    vmovsh %xmm0, %xmm1, %xmm3
+; X64-NOVL-NEXT:    vmovsh {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; X64-NOVL-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NOVL-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; X64-NOVL-NEXT:    vaddsh %xmm4, %xmm5, %xmm4
@@ -2311,7 +2311,7 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind {
 ; X64-LABEL: test21:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT:    vmovsh %xmm2, %xmm3, %xmm2
+; X64-NEXT:    vmovsh {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
 ; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
 ; X64-NEXT:    retq
@@ -2427,7 +2427,7 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X64VL-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; X64VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; X64VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X64VL-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
+; X64VL-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; X64VL-NEXT:    retq
 ;
 ; X86-LABEL: pr52561:
@@ -2443,7 +2443,7 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
 ; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
+; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
@@ -2474,7 +2474,7 @@ define <8 x i16> @pr59628_xmm(i16 %arg) {
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vpbroadcastw %eax, %xmm1
-; X86-NEXT:    vmovsh %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vmovsh {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
 ; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
 ; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index 7613c9ff43e2..b8ebe2a4890a 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -2,18 +2,18 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
-define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x50,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   ret <8 x i32> %1
 }
 
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -33,11 +33,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
 ; X64-NEXT:    vpdpbusd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x50,0xda]
 ; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <8 x i32>, ptr %x2p
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %x2 = load <32 x i8>, ptr %x2p
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
-  %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
   %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -45,18 +45,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32>
   ret { <8 x i32>, <8 x i32> } %res2
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
 
-define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x50,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   ret <4 x i32> %1
 }
 
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -76,12 +76,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
 ; X64-NEXT:    vpdpbusd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x50,0xda]
 ; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <4 x i32>, ptr %x2p
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %x2 = load <16 x i8>, ptr %x2p
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
-  %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -90,18 +90,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32>
   ret { <4 x i32>, <4 x i32> } %res2
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
-define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x51,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   ret <8 x i32> %1
 }
 
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -121,11 +121,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
 ; X64-NEXT:    vpdpbusds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x51,0xda]
 ; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <8 x i32>, ptr %x2p
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %x2 = load <32 x i8>, ptr %x2p
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
-  %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %4 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
   %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -133,18 +133,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32
   ret { <8 x i32>, <8 x i32> } %res2
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 
-define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x51,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   ret <4 x i32> %1
 }
 
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -164,12 +164,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ; X64-NEXT:    vpdpbusds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x51,0xda]
 ; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <4 x i32>, ptr %x2p
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %x2 = load <16 x i8>, ptr %x2p
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
-  %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %4 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
index 62c4d39e8615..63ff88a7fa4a 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -2,20 +2,31 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusd_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   ret <16 x i32> %res
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
-; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpbusd_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -25,7 +36,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpbusd_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
@@ -41,20 +52,31 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
   ret { <16 x i32>, <16 x i32> } %res3
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   ret <16 x i32> %res
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
-; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpbusds_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -64,7 +86,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpbusds_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 21d0010ff630..60d0298e057f 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -2,18 +2,18 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
-declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8>)
 
-define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpbusd_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x50,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
   ret <16 x i32> %1
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -32,11 +32,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
 ; X64-NEXT:    vpdpbusd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x50,0xda]
 ; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <16 x i32>, ptr %x2p
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %x2 = load <64 x i8>, ptr %x2p
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
-  %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+  %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
   %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -44,18 +44,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i
   ret { <16 x i32>, <16 x i32> } %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i8>)
 
-define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpbusds_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x51,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
   ret <16 x i32> %1
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -74,11 +74,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ; X64-NEXT:    vpdpbusds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x51,0xda]
 ; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <16 x i32>, ptr %x2p
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %x2 = load <64 x i8>, ptr %x2p
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
-  %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+  %4 = call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
   %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..0f4a4f27b971
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpbusd_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+    ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpbusd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+    ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpbusds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+    ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpbusds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+    ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index a1db6e54fa79..de8b2a41bf8c 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -4,9 +4,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
 
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
-define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
@@ -16,13 +16,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
 
-define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
@@ -32,13 +32,13 @@ define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   ret <4 x i32> %res
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
-define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
@@ -48,13 +48,13 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2)
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 
-define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
@@ -64,7 +64,7 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2)
   ret <4 x i32> %res
 }
 
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index 8601d454215a..abdc296ae1e1 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2-256 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
 
 define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
index 607720fbc3f3..0ddd0171a58a 100644
--- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8  --show-mc-encoding | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2-256  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2  --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64
 
 
 declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 423f2c49e70e..474be4465d9b 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -654,3 +654,110 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
   %2 = sext <64 x i1> %1 to <64 x i8>
   ret <64 x i8> %2
 }
+
+define <8 x i32> @PR157382(ptr %p0, ptr %p1, ptr %p2) {
+; SSE2-SSSE3-LABEL: PR157382:
+; SSE2-SSSE3:       # %bb.0:
+; SSE2-SSSE3-NEXT:    movdqu (%rdi), %xmm3
+; SSE2-SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
+; SSE2-SSSE3-NEXT:    movdqu (%rsi), %xmm0
+; SSE2-SSSE3-NEXT:    movdqu 16(%rsi), %xmm4
+; SSE2-SSSE3-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-SSSE3-NEXT:    pxor %xmm5, %xmm5
+; SSE2-SSSE3-NEXT:    pxor %xmm6, %xmm6
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
+; SSE2-SSSE3-NEXT:    pxor %xmm7, %xmm6
+; SSE2-SSSE3-NEXT:    pxor %xmm8, %xmm8
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm8
+; SSE2-SSSE3-NEXT:    pxor %xmm7, %xmm8
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm5, %xmm0
+; SSE2-SSSE3-NEXT:    por %xmm6, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
+; SSE2-SSSE3-NEXT:    por %xmm8, %xmm4
+; SSE2-SSSE3-NEXT:    packssdw %xmm4, %xmm0
+; SSE2-SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT:    pcmpeqb %xmm5, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm7, %xmm1
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
+; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT:    pslld $31, %xmm1
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm1
+; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: PR157382:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm0, %xmm2, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqu (%rsi), %xmm6
+; AVX1-NEXT:    vmovdqu 16(%rsi), %xmm7
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm6, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm7, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpmovsxwd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: PR157382:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqu (%rsi), %ymm1
+; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %ymm0, %ymm3, %ymm4
+; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT:    vpxor %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: PR157382:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpcmpnltd %ymm2, %ymm0, %k0
+; AVX512-NEXT:    vpcmpltd (%rsi), %ymm2, %k1
+; AVX512-NEXT:    vptestmb %xmm1, %xmm1, %k2
+; AVX512-NEXT:    korw %k1, %k0, %k0
+; AVX512-NEXT:    korw %k2, %k0, %k1
+; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT:    retq
+  %ld0 = load <8 x i32>, ptr %p0, align 1
+  %ld1 = load <8 x i32>, ptr %p1, align 1
+  %ld2 = load <8 x i8>, ptr %p2, align 1
+  %cmp0 = icmp sge <8 x i32> %ld0, zeroinitializer
+  %cmp1 = icmp sgt <8 x i32> %ld1, zeroinitializer
+  %cmp2 = icmp ne <8 x i8> %ld2, zeroinitializer
+  %cmp01 = or <8 x i1> %cmp0, %cmp1
+  %cmp012 = or <8 x i1> %cmp01, %cmp2
+  %res = select <8 x i1> %cmp012, <8 x i32> %ld0, <8 x i32> zeroinitializer
+  ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/bswap-inline-asm.ll b/llvm/test/CodeGen/X86/bswap-inline-asm.ll
index f8f154c0688f..a9ce616b7ecc 100644
--- a/llvm/test/CodeGen/X86/bswap-inline-asm.ll
+++ b/llvm/test/CodeGen/X86/bswap-inline-asm.ll
@@ -1,88 +1,150 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix CHK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 
-; CHK-NOT: InlineAsm
+; bswap inline assembly should be preserved as-is.
 
-; CHECK-LABEL: foo:
-; CHECK: bswapq
 define i64 @foo(i64 %x) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
 	%asmtmp = tail call i64 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i64 %x) nounwind
 	ret i64 %asmtmp
 }
 
-; CHECK-LABEL: bar:
-; CHECK: bswapq
 define i64 @bar(i64 %x) nounwind {
+; CHECK-LABEL: bar:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
 	%asmtmp = tail call i64 asm "bswapq ${0:q}", "=r,0,~{dirflag},~{fpsr},~{flags}"(i64 %x) nounwind
 	ret i64 %asmtmp
 }
 
-; CHECK-LABEL: pen:
-; CHECK: bswapl
 define i32 @pen(i32 %x) nounwind {
-	%asmtmp = tail call i32 asm "bswapl ${0:q}", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %x) nounwind
+; CHECK-LABEL: pen:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
+	%asmtmp = tail call i32 asm "bswapl ${0:k}", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %x) nounwind
 	ret i32 %asmtmp
 }
 
-; CHECK-LABEL: s16:
-; CHECK: rolw    $8,
 define zeroext i16 @s16(i16 zeroext %x) nounwind {
+; CHECK-LABEL: s16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    rorw $8, %di
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    retq
   %asmtmp = tail call i16 asm "rorw $$8, ${0:w}", "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i16 %x) nounwind
   ret i16 %asmtmp
 }
 
-; CHECK-LABEL: t16:
-; CHECK: rolw    $8,
 define zeroext i16 @t16(i16 zeroext %x) nounwind {
+; CHECK-LABEL: t16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    rorw $8, %di
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    retq
   %asmtmp = tail call i16 asm "rorw $$8, ${0:w}", "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"(i16 %x) nounwind
   ret i16 %asmtmp
 }
 
-; CHECK-LABEL: u16:
-; CHECK: rolw    $8,
 define zeroext i16 @u16(i16 zeroext %x) nounwind {
+; CHECK-LABEL: u16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    rolw $8, %di
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    retq
   %asmtmp = tail call i16 asm "rolw $$8, ${0:w}", "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i16 %x) nounwind
   ret i16 %asmtmp
 }
 
-; CHECK-LABEL: v16:
-; CHECK: rolw    $8,
 define zeroext i16 @v16(i16 zeroext %x) nounwind {
+; CHECK-LABEL: v16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    rolw $8, %di
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    retq
   %asmtmp = tail call i16 asm "rolw $$8, ${0:w}", "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"(i16 %x) nounwind
   ret i16 %asmtmp
 }
 
-; CHECK-LABEL: s32:
-; CHECK: bswapl
 define i32 @s32(i32 %x) nounwind {
+; CHECK-LABEL: s32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
   %asmtmp = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %x) nounwind
   ret i32 %asmtmp
 }
 
-; CHECK-LABEL: t32:
-; CHECK: bswapl
 define i32 @t32(i32 %x) nounwind {
+; CHECK-LABEL: t32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
   %asmtmp = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{flags},~{fpsr}"(i32 %x) nounwind
   ret i32 %asmtmp
 }
 
-; CHECK-LABEL: u32:
-; CHECK: bswapl
 define i32 @u32(i32 %x) nounwind {
+; CHECK-LABEL: u32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    rorw $8, %ax
+; CHECK-NEXT:    rorl $16, %eax
+; CHECK-NEXT:    rorw $8, %ax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
   %asmtmp = tail call i32 asm "rorw $$8, ${0:w};rorl $$16, $0;rorw $$8, ${0:w}", "=r,0,~{cc},~{dirflag},~{flags},~{fpsr}"(i32 %x) nounwind
   ret i32 %asmtmp
 }
 
-; CHECK-LABEL: s64:
-; CHECK: bswapq
 define i64 @s64(i64 %x) nounwind {
+; CHECK-LABEL: s64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
   %asmtmp = tail call i64 asm "bswap ${0:q}", "=r,0,~{dirflag},~{fpsr},~{flags}"(i64 %x) nounwind
   ret i64 %asmtmp
 }
 
-; CHECK-LABEL: t64:
-; CHECK: bswapq
 define i64 @t64(i64 %x) nounwind {
+; CHECK-LABEL: t64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    ## InlineAsm Start
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    ## InlineAsm End
+; CHECK-NEXT:    retq
   %asmtmp = tail call i64 asm "bswap ${0:q}", "=r,0,~{fpsr},~{dirflag},~{flags}"(i64 %x) nounwind
   ret i64 %asmtmp
 }
diff --git a/llvm/test/CodeGen/X86/call-graph-section.ll b/llvm/test/CodeGen/X86/call-graph-section.ll
index 4a9840eac489..66d009cf1221 100644
--- a/llvm/test/CodeGen/X86/call-graph-section.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section.ll
@@ -11,14 +11,12 @@ declare !type !2 ptr @baz(ptr)
 
 define void @main() {
 entry:
-  %a = alloca i8, align 1
   %fp_foo_val = load ptr, ptr null, align 8
   call void (...) %fp_foo_val(), !callee_type !1
   %fp_bar_val = load ptr, ptr null, align 8
-  %param = trunc i64 0 to i8
-  %call_fp_bar = call i32 %fp_bar_val(i8 signext %param), !callee_type !3
+  %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !3
   %fp_baz_val = load ptr, ptr null, align 8
-  %call_fp_baz = call ptr %fp_baz_val(ptr %a), !callee_type !4
+  %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4
   ret void
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-gfni.ll b/llvm/test/CodeGen/X86/combine-gfni.ll
new file mode 100644
index 000000000000..b105cdf7ea89
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-gfni.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+gfni,+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+gfni,+avx512bw | FileCheck %s --check-prefixes=AVX512
+
+define <16 x i8> @gf2p8affineqb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8affineqb_freeze:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtb %xmm2, %xmm3
+; SSE-NEXT:    gf2p8affineqb $11, %xmm1, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pandn %xmm0, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: gf2p8affineqb_freeze:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineqb $11, %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: gf2p8affineqb_freeze:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovb2m %xmm2, %k1
+; AVX512-NEXT:    vgf2p8affineqb $11, %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+  %i = icmp slt <16 x i8> %a2, zeroinitializer
+  %g = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %a1, <16 x i8> %a1, i8 11)
+  %f = freeze <16 x i8> %g
+  %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @gf2p8affineinvqb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8affineinvqb_freeze:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtb %xmm2, %xmm3
+; SSE-NEXT:    gf2p8affineinvqb $11, %xmm1, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pandn %xmm0, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: gf2p8affineinvqb_freeze:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8affineinvqb $11, %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: gf2p8affineinvqb_freeze:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovb2m %xmm2, %k1
+; AVX512-NEXT:    vgf2p8affineinvqb $11, %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+  %i = icmp slt <16 x i8> %a2, zeroinitializer
+  %g = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %a1, <16 x i8> %a1, i8 11)
+  %f = freeze <16 x i8> %g
+  %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @gf2p8mulb_freeze(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+; SSE-LABEL: gf2p8mulb_freeze:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pxor %xmm3, %xmm3
+; SSE-NEXT:    pcmpgtb %xmm2, %xmm3
+; SSE-NEXT:    gf2p8mulb %xmm1, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pandn %xmm0, %xmm3
+; SSE-NEXT:    por %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: gf2p8mulb_freeze:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vgf2p8mulb %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: gf2p8mulb_freeze:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovb2m %xmm2, %k1
+; AVX512-NEXT:    vgf2p8mulb %xmm1, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+  %i = icmp slt <16 x i8> %a2, zeroinitializer
+  %g = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %a1, <16 x i8> %a1)
+  %f = freeze <16 x i8> %g
+  %r = select <16 x i1> %i, <16 x i8> %f, <16 x i8> %a0
+  ret <16 x i8> %r
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
+declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>)
diff --git a/llvm/test/CodeGen/X86/combine-vpmadd52.ll b/llvm/test/CodeGen/X86/combine-vpmadd52.ll
new file mode 100644
index 000000000000..2cb060ea92b1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-vpmadd52.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxifma | FileCheck %s --check-prefixes=CHECK,AVX
+
+define <2 x i64> @test1_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test1_vpmadd52l:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test1_vpmadd52l:
+; AVX:       # %bb.0:
+; AVX-NEXT:    {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+
+  %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test2_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test2_vpmadd52l:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test2_vpmadd52l:
+; AVX:       # %bb.0:
+; AVX-NEXT:    {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %x2, splat (i64 4503599627370495) ; (1LL << 52) - 1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %and)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test3_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test3_vpmadd52l:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test3_vpmadd52l:
+; AVX:       # %bb.0:
+; AVX-NEXT:    {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+  %or = or <2 x i64> %x2, splat (i64 4503599627370496) ; 1LL << 52
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_bits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52l_wrong_bits:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
+; AVX512-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm2
+; AVX512-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_wrong_bits:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %x1, splat (i64 2251799813685247) ; (1LL << 51) - 1
+  %or = or <2 x i64> %x2, splat (i64 2251799813685248) ; 1LL << 51
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_wrong_op(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52l_wrong_op:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0
+; AVX512-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_wrong_op:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX-NEXT:    {vex} vpmadd52luq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %and, <2 x i64> %x1, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52h:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52h:
+; AVX:       # %bb.0:
+; AVX-NEXT:    {vex} vpmadd52huq %xmm2, %xmm1, %xmm0
+; AVX-NEXT:    retq
+
+  %and = and <2 x i64> %x1, splat (i64 4503599627370495) ; (1LL << 52) - 1
+  %or = or <2 x i64> %x2, splat (i64 4503599627370496) ; 1LL << 52
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %and, <2 x i64> %or)
+  ret <2 x i64> %1
+}
+
+; Test the fold x * 0 + y -> y
+define <2 x i64> @test_vpmadd52l_mul_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> %x1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h_mul_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52h_mul_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> %x1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_commuted(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> <i64 0, i64 0>)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_both(<2 x i64> %x0) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_both:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 0>, <2 x i64> <i64 0, i64 0>)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_in_52bits(<2 x i64> %x0, <2 x i64> %x1) {
+; CHECK-LABEL: test_vpmadd52l_mul_zero_in_52bits:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+
+  ; mul by (1 << 52)
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 4503599627370496), <2 x i64> %x1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_add_zero(<2 x i64> %x0, <2 x i64> %x1) {
+; AVX512-LABEL: test_vpmadd52l_add_zero:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_add_zero:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> <i64 0, i64 0>, <2 x i64> %x0, <2 x i64> %x1)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_zero_scalar(<2 x i64> %x0, <2 x i64> %x1) {
+; AVX512-LABEL: test_vpmadd52l_mul_zero_scalar:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmadd52luq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_mul_zero_scalar:
+; AVX:       # %bb.0:
+; AVX-NEXT:    {vex} vpmadd52luq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX-NEXT:    retq
+
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> <i64 0, i64 123>, <2 x i64> %x1)
+  ret <2 x i64> %1
+}
+
+; (1 << 51) * (1 << 1) -> 1 << 52 -> low 52 bits are zeroes
+define <2 x i64> @test_vpmadd52l_mul_lo52_zero(<2 x i64> %x0) {
+; CHECK-LABEL: test_vpmadd52l_mul_lo52_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 2251799813685248), <2 x i64> splat (i64 2))
+  ret <2 x i64> %1
+}
+
+; (1 << 25) * (1 << 26) = 1 << 51 -> high 52 bits are zeroes
+define <2 x i64> @test_vpmadd52h_mul_hi52_zero(<2 x i64> %x0) {
+; CHECK-LABEL: test_vpmadd52h_mul_hi52_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 33554432), <2 x i64> splat (i64 67108864))
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_lo52_const(<2 x i64> %x0) {
+; AVX512-LABEL: test_vpmadd52l_mul_lo52_const:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_mul_lo52_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 123), <2 x i64> splat (i64 456))
+  ret <2 x i64> %1
+}
+
+; (1 << 51) * (1 << 51) -> 1 << 102 -> the high 52 bits is 1 << 50
+define <2 x i64> @test_vpmadd52h_mul_hi52_const(<2 x i64> %x0) {
+; AVX512-LABEL: test_vpmadd52h_mul_hi52_const:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52h_mul_hi52_const:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> splat (i64 2251799813685248), <2 x i64> splat (i64 2251799813685248))
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_lo52_mask(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52l_mul_lo52_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 1073741824) ; 1LL << 30
+  %and2 = and <2 x i64> %x1, splat (i64 1073741824) ; 1LL << 30
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and1, <2 x i64> %and2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h_mul_hi52_mask(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test_vpmadd52h_mul_hi52_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
+  %and1 = lshr <2 x i64> %x0, splat (i64 40)
+  %and2 = lshr <2 x i64> %x1, splat (i64 40)
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %and1, <2 x i64> %and2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52l_mul_lo52_mask_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52l_mul_lo52_mask_negative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm2
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
+; AVX512-NEXT:    vpmadd52luq %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52l_mul_lo52_mask_negative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52luq %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 2097152) ; 1LL << 21
+  %and2 = and <2 x i64> %x1, splat (i64 1073741824) ; 1LL << 30
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %and1, <2 x i64> %and2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpmadd52h_mul_hi52_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test_vpmadd52h_mul_hi52_negative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $30, %xmm0, %xmm2
+; AVX512-NEXT:    vpsrlq $43, %xmm1, %xmm1
+; AVX512-NEXT:    vpmadd52huq %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test_vpmadd52h_mul_hi52_negative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $30, %xmm0, %xmm2
+; AVX-NEXT:    vpsrlq $43, %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52huq %xmm1, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %and1 = lshr <2 x i64> %x0, splat (i64 30)
+  %and2 = lshr <2 x i64> %x1, splat (i64 43)
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %and1, <2 x i64> %and2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test1_knownbits_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test1_knownbits_vpmadd52l:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [1,1]
+; CHECK-NEXT:    # xmm0 = mem[0,0]
+; CHECK-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 4)
+  %and2 = and <2 x i64> %x1, splat (i64 4)
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> splat(i64 1), <2 x i64> %and1, <2 x i64> %and2)
+  %ret = and <2 x i64> %madd, splat (i64 1)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @test1_knownbits_vpmadd52h(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test1_knownbits_vpmadd52h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [3,3]
+; CHECK-NEXT:    # xmm0 = mem[0,0]
+; CHECK-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 1073741824) ; 1LL << 30
+  %and2 = and <2 x i64> %x1, splat (i64 1073741824) ; 1LL << 30
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> splat(i64 3), <2 x i64> %and1, <2 x i64> %and2)
+  %ret = and <2 x i64> %madd, splat (i64 3)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @test2_knownbits_vpmadd52l(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test2_knownbits_vpmadd52l:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [1234,1234]
+; CHECK-NEXT:    # xmm0 = mem[0,0]
+; CHECK-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 67108864) ; 1LL << 26
+  %and2 = and <2 x i64> %x1, splat (i64 33554432) ; 1LL << 25
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> splat(i64 1234), <2 x i64> %and1, <2 x i64> %and2)
+  %ret = and <2 x i64> %madd, splat (i64 1234)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @test2_knownbits_vpmadd52h(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; CHECK-LABEL: test2_knownbits_vpmadd52h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = [1,1]
+; CHECK-NEXT:    # xmm0 = mem[0,0]
+; CHECK-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 1073741824) ; 1LL << 30
+  %and2 = and <2 x i64> %x1, splat (i64 1073741824) ; 1LL << 30
+  ; add (1LL << 20) + 1
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> splat(i64 1025), <2 x i64> %and1, <2 x i64> %and2)
+  %ret = and <2 x i64> %madd, splat (i64 1)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @test3_knownbits_vpmadd52l_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test3_knownbits_vpmadd52l_negative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,1]
+; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm3
+; AVX512-NEXT:    vpmadd52luq %xmm1, %xmm0, %xmm3
+; AVX512-NEXT:    vpand %xmm2, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test3_knownbits_vpmadd52l_negative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,1]
+; AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa %xmm2, %xmm3
+; AVX-NEXT:    {vex} vpmadd52luq %xmm1, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm2, %xmm3, %xmm0
+; AVX-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 67108865) ; (1LL << 26) + 1
+  %or = or <2 x i64> %x1, splat (i64 1)
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> splat(i64 1), <2 x i64> %and1, <2 x i64> %or)
+  %ret = and <2 x i64> %madd, splat (i64 1)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @test3_knownbits_vpmadd52h_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+; AVX512-LABEL: test3_knownbits_vpmadd52h_negative:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [1,1]
+; AVX512-NEXT:    vmovdqa %xmm2, %xmm3
+; AVX512-NEXT:    vpmadd52huq %xmm1, %xmm0, %xmm3
+; AVX512-NEXT:    vpand %xmm2, %xmm3, %xmm0
+; AVX512-NEXT:    retq
+;
+; AVX-LABEL: test3_knownbits_vpmadd52h_negative:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [1,1]
+; AVX-NEXT:    vmovdqa %xmm2, %xmm3
+; AVX-NEXT:    {vex} vpmadd52huq %xmm1, %xmm0, %xmm3
+; AVX-NEXT:    vpand %xmm2, %xmm3, %xmm0
+; AVX-NEXT:    retq
+  %and1 = and <2 x i64> %x0, splat (i64 4194304) ; 1LL << 22
+  %and2 = and <2 x i64> %x1, splat (i64 1073741824) ; 1LL << 30
+  ; add (1LL << 20) + 1
+  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> splat(i64 1), <2 x i64> %and1, <2 x i64> %and2)
+  %ret = and <2 x i64> %madd, splat (i64 1)
+  ret <2 x i64> %ret
+}
diff --git a/llvm/test/CodeGen/X86/comi-flags.ll b/llvm/test/CodeGen/X86/comi-flags.ll
index 6f520aa57dcd..805b1b54d5b6 100644
--- a/llvm/test/CodeGen/X86/comi-flags.ll
+++ b/llvm/test/CodeGen/X86/comi-flags.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx  | FileCheck %s --check-prefixes=AVX,NO-AVX10_2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX,AVX10_2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx10.2 | FileCheck %s --check-prefixes=AVX,AVX10_2
 
 ;
 ; SSE
diff --git a/llvm/test/CodeGen/X86/evex512-mem.ll b/llvm/test/CodeGen/X86/evex512-mem.ll
deleted file mode 100644
index 85bb3b3a5487..000000000000
--- a/llvm/test/CodeGen/X86/evex512-mem.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512bw,avx512vl,-evex512 < %s | FileCheck %s --check-prefix=AVX256
-
-define void @test1() {
-; AVX512-LABEL: test1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq 64, %rax
-; AVX512-NEXT:    movq %rax, (%rax)
-; AVX512-NEXT:    vmovups 0, %zmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rax)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-;
-; AVX256-LABEL: test1:
-; AVX256:       # %bb.0:
-; AVX256-NEXT:    movq 64, %rax
-; AVX256-NEXT:    movq %rax, (%rax)
-; AVX256-NEXT:    vmovups 0, %ymm0
-; AVX256-NEXT:    vmovups 32, %ymm1
-; AVX256-NEXT:    vmovups %ymm1, (%rax)
-; AVX256-NEXT:    vmovups %ymm0, (%rax)
-; AVX256-NEXT:    vzeroupper
-; AVX256-NEXT:    retq
-  call void @llvm.memcpy.p0.p0.i64(ptr align 8 poison, ptr align 8 null, i64 72, i1 false)
-  ret void
-}
-
-declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
diff --git a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
new file mode 100644
index 000000000000..a155d125a6d1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
@@ -0,0 +1,252 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
+
+; expand-fp must also run with optnone
+
+; Function Attrs: noinline optnone
+define double @main(i224 %0) #0 {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entryitofp-entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    subq $88, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 144
+; CHECK-NEXT:    .cfi_offset %rbx, -56
+; CHECK-NEXT:    .cfi_offset %r12, -48
+; CHECK-NEXT:    .cfi_offset %r13, -40
+; CHECK-NEXT:    .cfi_offset %r14, -32
+; CHECK-NEXT:    .cfi_offset %r15, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    orq %rdx, %rax
+; CHECK-NEXT:    movl %ecx, %r8d
+; CHECK-NEXT:    movq %rsi, %r9
+; CHECK-NEXT:    orq %r8, %r9
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    orq %r9, %rax
+; CHECK-NEXT:    je .LBB0_10
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_1: # %itofp-if-end
+; CHECK-NEXT:    movslq %ecx, %rax
+; CHECK-NEXT:    movq %rax, %r9
+; CHECK-NEXT:    sarq $31, %r9
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    xorq %rax, %rcx
+; CHECK-NEXT:    xorq %rax, %rdx
+; CHECK-NEXT:    xorq %rax, %rsi
+; CHECK-NEXT:    xorq %r9, %rdi
+; CHECK-NEXT:    subq %r9, %rdi
+; CHECK-NEXT:    sbbq %rax, %rsi
+; CHECK-NEXT:    sbbq %rax, %rdx
+; CHECK-NEXT:    sbbq %rax, %rcx
+; CHECK-NEXT:    movq %rcx, %r8
+; CHECK-NEXT:    shldq $32, %rdx, %r8
+; CHECK-NEXT:    bsrq %r8, %rax
+; CHECK-NEXT:    xorl $63, %eax
+; CHECK-NEXT:    movq %rdx, %r10
+; CHECK-NEXT:    shldq $32, %rsi, %r10
+; CHECK-NEXT:    bsrq %r10, %r11
+; CHECK-NEXT:    xorl $63, %r11d
+; CHECK-NEXT:    orl $64, %r11d
+; CHECK-NEXT:    testq %r8, %r8
+; CHECK-NEXT:    cmovnel %eax, %r11d
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    shldq $32, %rdi, %rbx
+; CHECK-NEXT:    bsrq %rbx, %r14
+; CHECK-NEXT:    xorl $63, %r14d
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shlq $32, %rax
+; CHECK-NEXT:    bsrq %rax, %rax
+; CHECK-NEXT:    xorl $63, %eax
+; CHECK-NEXT:    orl $64, %eax
+; CHECK-NEXT:    testq %rbx, %rbx
+; CHECK-NEXT:    cmovnel %r14d, %eax
+; CHECK-NEXT:    subl $-128, %eax
+; CHECK-NEXT:    orq %r8, %r10
+; CHECK-NEXT:    cmovnel %r11d, %eax
+; CHECK-NEXT:    movl $224, %r11d
+; CHECK-NEXT:    subl %eax, %r11d
+; CHECK-NEXT:    movl $223, %r10d
+; CHECK-NEXT:    subl %eax, %r10d
+; CHECK-NEXT:    cmpl $53, %r11d
+; CHECK-NEXT:    jle .LBB0_8
+; CHECK-NEXT:  # %bb.2: # %itofp-if-then4
+; CHECK-NEXT:    movl %r11d, %r8d
+; CHECK-NEXT:    subl $54, %r8d
+; CHECK-NEXT:    je .LBB0_4
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_3: # %itofp-if-then4
+; CHECK-NEXT:    movl %r11d, %r8d
+; CHECK-NEXT:    subl $55, %r8d
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.11:
+; CHECK-NEXT:    jmp .LBB0_6
+; CHECK-NEXT:  .LBB0_4: # %itofp-sw-bb
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    shldq $1, %rdi, %rax
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    shldq $1, %rsi, %r8
+; CHECK-NEXT:    shldq $1, %rdx, %rcx
+; CHECK-NEXT:    addq %rdi, %rdi
+; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    movq %r8, %rdx
+; CHECK-NEXT:    jmp .LBB0_6
+; CHECK-NEXT:  .LBB0_5: # %itofp-sw-default
+; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl %ecx, %r8d
+; CHECK-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $-87, %r8b
+; CHECK-NEXT:    subb %al, %r8b
+; CHECK-NEXT:    movb %r8b, %bl
+; CHECK-NEXT:    shrb $6, %bl
+; CHECK-NEXT:    movzbl %bl, %r12d
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq -24(%rsp,%r12,8), %rbx
+; CHECK-NEXT:    movq -32(%rsp,%r12,8), %r13
+; CHECK-NEXT:    movq %rcx, %rbp
+; CHECK-NEXT:    movb %r8b, %cl
+; CHECK-NEXT:    movq %r13, %r14
+; CHECK-NEXT:    shrdq %cl, %rbx, %r14
+; CHECK-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movq -48(%rsp,%r12,8), %r15
+; CHECK-NEXT:    movq -40(%rsp,%r12,8), %r12
+; CHECK-NEXT:    movb %r8b, %cl
+; CHECK-NEXT:    movq %r12, %r14
+; CHECK-NEXT:    shrdq %cl, %r13, %r14
+; CHECK-NEXT:    movb %r8b, %cl
+; CHECK-NEXT:    shrq %cl, %rbx
+; CHECK-NEXT:    movb %r8b, %cl
+; CHECK-NEXT:    shrdq %cl, %r12, %r15
+; CHECK-NEXT:    addb $55, %al
+; CHECK-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rbp, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    shrb $3, %cl
+; CHECK-NEXT:    andb $24, %cl
+; CHECK-NEXT:    negb %cl
+; CHECK-NEXT:    movsbq %cl, %rdx
+; CHECK-NEXT:    movq -80(%rsp,%rdx), %rsi
+; CHECK-NEXT:    movq -72(%rsp,%rdx), %rdi
+; CHECK-NEXT:    movq -64(%rsp,%rdx), %r8
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    movq %r8, %r12
+; CHECK-NEXT:    shldq %cl, %rdi, %r12
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    movq %rsi, %r13
+; CHECK-NEXT:    shlq %cl, %r13
+; CHECK-NEXT:    orq %r12, %r13
+; CHECK-NEXT:    movq -56(%rsp,%rdx), %rdx
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    shldq %cl, %r8, %rdx
+; CHECK-NEXT:    movl %edx, %edx
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    shldq %cl, %rsi, %rdi
+; CHECK-NEXT:    orq %rdx, %rdi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    orq %rdi, %r13
+; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    orq %rax, %r15
+; CHECK-NEXT:    movq %r15, %rdi
+; CHECK-NEXT:    movq %r14, %rsi
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT:    movq %rbx, %rcx
+; CHECK-NEXT:    jmp .LBB0_6
+; CHECK-NEXT:  .LBB0_6: # %itofp-sw-epilog
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $2, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    orq %rax, %rdi
+; CHECK-NEXT:    addq $1, %rdi
+; CHECK-NEXT:    adcq $0, %rsi
+; CHECK-NEXT:    adcq $0, %rdx
+; CHECK-NEXT:    adcq $0, %rcx
+; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    shldq $62, %rdi, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    btq $55, %rdi
+; CHECK-NEXT:    jae .LBB0_9
+; CHECK-NEXT:    jmp .LBB0_7
+; CHECK-NEXT:  .LBB0_7: # %itofp-if-then20
+; CHECK-NEXT:    shldq $61, %rdi, %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    movl %r11d, %r10d
+; CHECK-NEXT:    jmp .LBB0_9
+; CHECK-NEXT:  .LBB0_8: # %itofp-if-else
+; CHECK-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    addb $85, %al
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    shrb $3, %cl
+; CHECK-NEXT:    andb $24, %cl
+; CHECK-NEXT:    negb %cl
+; CHECK-NEXT:    movsbq %cl, %rcx
+; CHECK-NEXT:    movq 48(%rsp,%rcx), %rdx
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    shlq %cl, %rdx
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:  .LBB0_9: # %itofp-if-end26
+; CHECK-NEXT:    andl $-2147483648, %r9d # imm = 0x80000000
+; CHECK-NEXT:    shll $20, %r10d
+; CHECK-NEXT:    addl $1072693248, %r10d # imm = 0x3FF00000
+; CHECK-NEXT:    andl $1048575, %eax # imm = 0xFFFFF
+; CHECK-NEXT:    orl %r9d, %eax
+; CHECK-NEXT:    orl %r10d, %eax
+; CHECK-NEXT:    movl %eax, %eax
+; CHECK-NEXT:    shlq $32, %rax
+; CHECK-NEXT:    movabsq $4294967295, %rcx # imm = 0xFFFFFFFF
+; CHECK-NEXT:    andq %rcx, %rdx
+; CHECK-NEXT:    orq %rdx, %rax
+; CHECK-NEXT:    movq %rax, %xmm0
+; CHECK-NEXT:  .LBB0_10: # %itofp-return
+; CHECK-NEXT:    addq $88, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 56
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %x = sitofp i224 %0 to double
+  ret double %x
+}
+
+attributes #0 = { noinline optnone }
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 989aabc9e87b..864c2336f37c 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2  | FileCheck %s --check-prefixes=AVX10_2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
 
 declare float @llvm.maximum.f32(float, float)
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index eef87b5a9f85..54d82b0c1c92 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2  | FileCheck %s --check-prefixes=AVX10_2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
 
 declare float @llvm.maximumnum.f32(float, float)
diff --git a/llvm/test/CodeGen/X86/fp16-reload.mir b/llvm/test/CodeGen/X86/fp16-reload.mir
new file mode 100644
index 000000000000..ddbd48cbf3ee
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-reload.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown -start-before=twoaddressinstruction -stop-after=postrapseudos -verify-machineinstrs -o - %s | FileCheck %s
+
+...
+---
+name:            test
+alignment:       16
+tracksRegLiveness: true
+debugInstrRef:   true
+registers:
+liveins:
+  - { reg: '$xmm0', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+    liveins: $xmm0
+
+    ; CHECK-LABEL: name: test
+    ; CHECK: liveins: $xmm0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: MOVSSmr $rsp, 1, $noreg, -4, $noreg, $xmm0 :: (store (s32) into %stack.0, align 2)
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+    ; CHECK-NEXT: renamable $xmm0 = MOVSSrm $rsp, 1, $noreg, -4, $noreg :: (load (s32) from %stack.0, align 2)
+    ; CHECK-NEXT: FNOP implicit-def $fpsw, implicit killed renamable $xmm0
+    ; CHECK-NEXT: RET 0
+    %0:fr16 = COPY killed $xmm0
+    INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $xmm0, 12 /* clobber */, implicit-def dead early-clobber $xmm1, 12 /* clobber */, implicit-def dead early-clobber $xmm2, 12 /* clobber */, implicit-def dead early-clobber $xmm3, 12 /* clobber */, implicit-def dead early-clobber $xmm4, 12 /* clobber */, implicit-def dead early-clobber $xmm5, 12 /* clobber */, implicit-def dead early-clobber $xmm6, 12 /* clobber */, implicit-def dead early-clobber $xmm7, 12 /* clobber */, implicit-def dead early-clobber $xmm8, 12 /* clobber */, implicit-def dead early-clobber $xmm9, 12 /* clobber */, implicit-def dead early-clobber $xmm10, 12 /* clobber */, implicit-def dead early-clobber $xmm11, 12 /* clobber */, implicit-def dead early-clobber $xmm12, 12 /* clobber */, implicit-def dead early-clobber $xmm13, 12 /* clobber */, implicit-def dead early-clobber $xmm14, 12 /* clobber */, implicit-def dead early-clobber $xmm15, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags
+    FNOP implicit-def $fpsw, implicit %0:fr16
+    RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/fp16-spill.ll b/llvm/test/CodeGen/X86/fp16-spill.ll
new file mode 100644
index 000000000000..6161009b6f56
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp16-spill.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512
+
+define half @test(float %f, ptr %p) nounwind {
+; SSE2-LABEL: test:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $16, %rsp
+; SSE2-NEXT:    movq %rdi, %rbx
+; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    callq __extendhfsf2@PLT
+; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE2-NEXT:    #APP
+; SSE2-NEXT:    #NO_APP
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss %xmm0, (%rbx)
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    addq $16, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbx
+; AVX-NEXT:    subq $16, %rsp
+; AVX-NEXT:    movq %rdi, %rbx
+; AVX-NEXT:    callq __truncsfhf2@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    callq __extendhfsf2@PLT
+; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX-NEXT:    #APP
+; AVX-NEXT:    #NO_APP
+; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss %xmm0, (%rbx)
+; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    addq $16, %rsp
+; AVX-NEXT:    popq %rbx
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT:    #APP
+; AVX512-NEXT:    #NO_APP
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    vmovss %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT:    retq
+  %t = fptrunc float %f to half
+  %t2 = fpext half %t to float
+  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
+  store float %t2, ptr %p
+  ret half %t
+}
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
index c79e19f07cda..77eaaa1ca08d 100644
--- a/llvm/test/CodeGen/X86/fpenv.ll
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -11,244 +11,6 @@ declare i32 @llvm.get.fpmode.i32()
 declare void @llvm.set.fpmode.i32(i32 %fpmode)
 declare void @llvm.reset.fpmode()
 
-define void @func_01() nounwind {
-; X86-NOSSE-LABEL: func_01:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    fnstcw (%esp)
-; X86-NOSSE-NEXT:    orb $12, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    fldcw (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE-LABEL: func_01:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    fnstcw (%esp)
-; X86-SSE-NEXT:    orb $12, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fldcw (%esp)
-; X86-SSE-NEXT:    stmxcsr (%esp)
-; X86-SSE-NEXT:    orb $96, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    ldmxcsr (%esp)
-; X86-SSE-NEXT:    popl %eax
-; X86-SSE-NEXT:    retl
-;
-; X64-LABEL: func_01:
-; X64:       # %bb.0:
-; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    orb $12, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    orb $96, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    retq
-  call void @llvm.set.rounding(i32 0)  ; TowardZero (CW[11-10] = 11)
-  ret void
-}
-
-define void @func_02() nounwind {
-; X86-NOSSE-LABEL: func_02:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    fnstcw (%esp)
-; X86-NOSSE-NEXT:    andb $-13, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    fldcw (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE-LABEL: func_02:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    fnstcw (%esp)
-; X86-SSE-NEXT:    andb $-13, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fldcw (%esp)
-; X86-SSE-NEXT:    stmxcsr (%esp)
-; X86-SSE-NEXT:    andb $-97, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    ldmxcsr (%esp)
-; X86-SSE-NEXT:    popl %eax
-; X86-SSE-NEXT:    retl
-;
-; X64-LABEL: func_02:
-; X64:       # %bb.0:
-; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andb $-13, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    andb $-97, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    retq
-  call void @llvm.set.rounding(i32 1)  ; ToNearestTiesToEven (CW[11-10] = 00)
-  ret void
-}
-
-define void @func_03() nounwind {
-; X86-NOSSE-LABEL: func_03:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    fnstcw (%esp)
-; X86-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X86-NOSSE-NEXT:    andl (%esp), %eax
-; X86-NOSSE-NEXT:    orl $2048, %eax # imm = 0x800
-; X86-NOSSE-NEXT:    movw %ax, (%esp)
-; X86-NOSSE-NEXT:    fldcw (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE-LABEL: func_03:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    fnstcw (%esp)
-; X86-SSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X86-SSE-NEXT:    andl (%esp), %eax
-; X86-SSE-NEXT:    orl $2048, %eax # imm = 0x800
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    fldcw (%esp)
-; X86-SSE-NEXT:    stmxcsr (%esp)
-; X86-SSE-NEXT:    movl $-24577, %eax # imm = 0x9FFF
-; X86-SSE-NEXT:    andl (%esp), %eax
-; X86-SSE-NEXT:    orl $16384, %eax # imm = 0x4000
-; X86-SSE-NEXT:    movl %eax, (%esp)
-; X86-SSE-NEXT:    ldmxcsr (%esp)
-; X86-SSE-NEXT:    popl %eax
-; X86-SSE-NEXT:    retl
-;
-; X64-LABEL: func_03:
-; X64:       # %bb.0:
-; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    orl $2048, %eax # imm = 0x800
-; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-24577, %eax # imm = 0x9FFF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    orl $16384, %eax # imm = 0x4000
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    retq
-  call void @llvm.set.rounding(i32 2)  ; Upward (CW[11-10] = 10)
-  ret void
-}
-
-define void @func_04() nounwind {
-; X86-NOSSE-LABEL: func_04:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    fnstcw (%esp)
-; X86-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X86-NOSSE-NEXT:    andl (%esp), %eax
-; X86-NOSSE-NEXT:    orl $1024, %eax # imm = 0x400
-; X86-NOSSE-NEXT:    movw %ax, (%esp)
-; X86-NOSSE-NEXT:    fldcw (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE-LABEL: func_04:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    fnstcw (%esp)
-; X86-SSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X86-SSE-NEXT:    andl (%esp), %eax
-; X86-SSE-NEXT:    orl $1024, %eax # imm = 0x400
-; X86-SSE-NEXT:    movw %ax, (%esp)
-; X86-SSE-NEXT:    fldcw (%esp)
-; X86-SSE-NEXT:    stmxcsr (%esp)
-; X86-SSE-NEXT:    movl $-24577, %eax # imm = 0x9FFF
-; X86-SSE-NEXT:    andl (%esp), %eax
-; X86-SSE-NEXT:    orl $8192, %eax # imm = 0x2000
-; X86-SSE-NEXT:    movl %eax, (%esp)
-; X86-SSE-NEXT:    ldmxcsr (%esp)
-; X86-SSE-NEXT:    popl %eax
-; X86-SSE-NEXT:    retl
-;
-; X64-LABEL: func_04:
-; X64:       # %bb.0:
-; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-3073, %eax # imm = 0xF3FF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    orl $1024, %eax # imm = 0x400
-; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-24577, %eax # imm = 0x9FFF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    orl $8192, %eax # imm = 0x2000
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    retq
-  call void @llvm.set.rounding(i32 3)  ; Downward (CW[11-10] = 01)
-  ret void
-}
-
-define void @func_05(i32 %x) nounwind {
-; X86-NOSSE-LABEL: func_05:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    leal 4(%eax,%eax), %ecx
-; X86-NOSSE-NEXT:    movl $201, %eax
-; X86-NOSSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOSSE-NEXT:    shll %cl, %eax
-; X86-NOSSE-NEXT:    andl $3072, %eax # imm = 0xC00
-; X86-NOSSE-NEXT:    fnstcw (%esp)
-; X86-NOSSE-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
-; X86-NOSSE-NEXT:    andl (%esp), %ecx
-; X86-NOSSE-NEXT:    orl %eax, %ecx
-; X86-NOSSE-NEXT:    movw %cx, (%esp)
-; X86-NOSSE-NEXT:    fldcw (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE-LABEL: func_05:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    leal 4(%eax,%eax), %ecx
-; X86-SSE-NEXT:    movl $201, %eax
-; X86-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE-NEXT:    shll %cl, %eax
-; X86-SSE-NEXT:    andl $3072, %eax # imm = 0xC00
-; X86-SSE-NEXT:    fnstcw (%esp)
-; X86-SSE-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
-; X86-SSE-NEXT:    andl (%esp), %ecx
-; X86-SSE-NEXT:    orl %eax, %ecx
-; X86-SSE-NEXT:    movw %cx, (%esp)
-; X86-SSE-NEXT:    fldcw (%esp)
-; X86-SSE-NEXT:    stmxcsr (%esp)
-; X86-SSE-NEXT:    movl $-24577, %ecx # imm = 0x9FFF
-; X86-SSE-NEXT:    andl (%esp), %ecx
-; X86-SSE-NEXT:    leal (%ecx,%eax,8), %eax
-; X86-SSE-NEXT:    movl %eax, (%esp)
-; X86-SSE-NEXT:    ldmxcsr (%esp)
-; X86-SSE-NEXT:    popl %eax
-; X86-SSE-NEXT:    retl
-;
-; X64-LABEL: func_05:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal 4(%rdi,%rdi), %ecx
-; X64-NEXT:    movl $201, %eax
-; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shll %cl, %eax
-; X64-NEXT:    andl $3072, %eax # imm = 0xC00
-; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl $-24577, %ecx # imm = 0x9FFF
-; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    leal (%rcx,%rax,8), %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
-; X64-NEXT:    retq
-  call void @llvm.set.rounding(i32 %x)  ; Downward
-  ret void
-}
-
 define void @get_fpenv_01(ptr %ptr) #0 {
 ; X86-NOSSE-LABEL: get_fpenv_01:
 ; X86-NOSSE:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 3196f8177cc9..38e3e23f7caa 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -141,3 +141,48 @@ entry:
   %z = urem i32 %y, 10
   ret i32 %z
 }
+
+; Make sure we don't crash when replacing all uses of N with an existing freeze N.
+
+define i64 @pr155345(ptr %p1, i1 %cond, ptr %p2, ptr %p3) {
+; X86ASM-LABEL: pr155345:
+; X86ASM:       # %bb.0: # %entry
+; X86ASM-NEXT:    movzbl (%rdi), %edi
+; X86ASM-NEXT:    xorl %eax, %eax
+; X86ASM-NEXT:    orb $1, %dil
+; X86ASM-NEXT:    movb %dil, (%rdx)
+; X86ASM-NEXT:    movzbl %dil, %edx
+; X86ASM-NEXT:    cmovel %edx, %eax
+; X86ASM-NEXT:    sete %dil
+; X86ASM-NEXT:    testb $1, %sil
+; X86ASM-NEXT:    cmovnel %edx, %eax
+; X86ASM-NEXT:    movb %dl, (%rcx)
+; X86ASM-NEXT:    movl $1, %edx
+; X86ASM-NEXT:    movl %eax, %ecx
+; X86ASM-NEXT:    shlq %cl, %rdx
+; X86ASM-NEXT:    orb %sil, %dil
+; X86ASM-NEXT:    movzbl %dil, %eax
+; X86ASM-NEXT:    andl %edx, %eax
+; X86ASM-NEXT:    andl $1, %eax
+; X86ASM-NEXT:    retq
+entry:
+  %load1 = load i8, ptr %p1, align 1
+  %v1 = or i8 %load1, 1
+  %v2 = zext i8 %v1 to i32
+  store i8 %v1, ptr %p2, align 1
+  %v3 = load i8, ptr %p2, align 1
+  %ext1 = sext i8 %v3 to i64
+  %ext2 = zext i32 %v2 to i64
+  %cmp1 = icmp ult i64 0, %ext1
+  %v4 = select i1 %cond, i1 false, i1 %cmp1
+  %sel1 = select i1 %v4, i64 0, i64 %ext2
+  %shl = shl i64 1, %sel1
+  store i8 %v1, ptr %p3, align 1
+  %v5 = load i8, ptr %p3, align 1
+  %ext3 = sext i8 %v5 to i64
+  %cmp2 = icmp ult i64 0, %ext3
+  %v6 = select i1 %cond, i1 false, i1 %cmp2
+  %sel2 = select i1 %v6, i64 0, i64 1
+  %and = and i64 %sel2, %shl
+  ret i64 %and
+}
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
new file mode 100644
index 000000000000..aebfc7d483d6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -0,0 +1,580 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
+
+; 67108863 == (1 << 26) - 1
+; 4503599627370496 == (1 << 52)
+; 4503599627370495 == (1 << 52) - 1
+
+define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_combine:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <8 x i64> %x, splat (i64 67108863)
+  %y_masked = and <8 x i64> %y, splat (i64 67108863)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_combine_v2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
+; AVX-NEXT:    vpand %ymm7, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVX-NEXT:    vpand %ymm7, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVX-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX-NEXT:    vmovdqa %ymm5, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_combine_v2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
+  %y_masked = and <8 x i64> %y, splat (i64 3)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm7
+; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm8
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm9
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm6
+; AVX-NEXT:    vpsrlq $32, %ymm8, %ymm8
+; AVX-NEXT:    vpmuludq %ymm3, %ymm8, %ymm8
+; AVX-NEXT:    vpsrlq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm6, %ymm1, %ymm6
+; AVX-NEXT:    vpaddq %ymm6, %ymm8, %ymm6
+; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpsrlq $32, %ymm7, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vpsrlq $32, %ymm9, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_no_combine:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm4
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm3
+; AVX512-NEXT:    vpsrlq $32, %zmm4, %zmm4
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm4, %zmm4
+; AVX512-NEXT:    vpsrlq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
+  %y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
+  %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <8 x i64> %mul, %z
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_512_no_combine_v2:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm6, %ymm6
+; AVX-NEXT:    vpsrlq $32, %ymm3, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm1, %ymm7
+; AVX-NEXT:    vpaddq %ymm6, %ymm7, %ymm6
+; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vpsrlq $32, %ymm2, %ymm7
+; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_512_no_combine_v2:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT:    vpsrlq $32, %zmm1, %zmm4
+; AVX512-NEXT:    vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NEXT:    vpaddq %zmm3, %zmm4, %zmm3
+; AVX512-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %mul = mul <8 x i64> %x, %y
+  %res = add <8 x i64> %mul, %z
+  ret <8 x i64> %res
+}
+
+define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; AVX-LABEL: test_256_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_256_combine:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512-NOVL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_256_combine:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX512VL-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %x_masked = and <4 x i64> %x, splat(i64 67108863)
+  %y_masked = and <4 x i64> %y, splat(i64 67108863)
+  %mul = mul nuw nsw <4 x i64> %x_masked, %y_masked
+  %res = add nuw nsw <4 x i64> %z, %mul
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; X64-LABEL: test_256_no_combine:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; X64-NEXT:    vpmuludq %ymm1, %ymm3, %ymm3
+; X64-NEXT:    vpsrlq $32, %ymm1, %ymm4
+; X64-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
+; X64-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; X64-NEXT:    vpsllq $32, %ymm3, %ymm3
+; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; X64-NEXT:    retq
+  %mul = mul <4 x i64> %x, %y
+  %res = add <4 x i64> %mul, %z
+  ret <4 x i64> %res
+}
+
+define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
+; AVX-LABEL: test_128_combine:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_128_combine:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX512-NOVL-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX512-NOVL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512-NOVL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_128_combine:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmadd52luq %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512VL-NEXT:    retq
+  %x_masked = and <2 x i64> %x, splat (i64 67108863)
+  %y_masked = and <2 x i64> %y, splat (i64 67108863)
+  %mul = mul <2 x i64> %x_masked, %y_masked
+  %res = add <2 x i64> %z, %mul
+  ret <2 x i64> %res
+}
+
+; Sanity check we're not applying this here
+define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+; X64-LABEL: test_scalar_no_ifma:
+; X64:       # %bb.0:
+; X64-NEXT:    imulq %rsi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdx), %rax
+; X64-NEXT:    retq
+  %mul = mul <1 x i64> %x, %y
+  %res = add <1 x i64> %mul, %z
+  ret <1 x i64> %res
+}
+
+; 40-bit and 13-bit, too wide
+define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; AVX-LABEL: test_mixed_width_too_wide:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191]
+; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052]
+; AVX-NEXT:    vpshufb %ymm6, %ymm1, %ymm7
+; AVX-NEXT:    vpmuludq %ymm3, %ymm7, %ymm7
+; AVX-NEXT:    vpsllq $32, %ymm7, %ymm7
+; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX-NEXT:    vpshufb %ymm6, %ymm0, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm1, %ymm5, %ymm1
+; AVX-NEXT:    vpaddq %ymm7, %ymm1, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_mixed_width_too_wide:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm3
+; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
+; AVX512-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %x40 = and <8 x i64> %x, splat (i64 1099511627775)
+  %y13 = and <8 x i64> %y, splat (i64 8191)
+  %mul = mul <8 x i64> %x40, %y13
+  %res = add <8 x i64> %z, %mul
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
+; AVX-LABEL: test_zext32_inputs_not_safe:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpmuludq %ymm5, %ymm4, %ymm4
+; AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpaddq %ymm4, %ymm2, %ymm0
+; AVX-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zext32_inputs_not_safe:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %x = zext <8 x i32> %xi32 to <8 x i64>
+  %y = zext <8 x i32> %yi32 to <8 x i64>
+  %mul = mul <8 x i64> %x, %y
+  %res = add <8 x i64> %z, %mul
+  ret <8 x i64> %res
+}
+
+define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind {
+; AVX-LABEL: test_1024_combine_split:
+; AVX:       # %bb.0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    movq %rsp, %rbp
+; AVX-NEXT:    andq $-32, %rsp
+; AVX-NEXT:    subq $32, %rsp
+; AVX-NEXT:    vmovdqa 112(%rbp), %ymm8
+; AVX-NEXT:    vmovdqa 80(%rbp), %ymm9
+; AVX-NEXT:    vmovdqa 48(%rbp), %ymm10
+; AVX-NEXT:    vmovdqa 16(%rbp), %ymm11
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm12, %ymm3
+; AVX-NEXT:    vpand %ymm2, %ymm12, %ymm2
+; AVX-NEXT:    vpand %ymm1, %ymm12, %ymm1
+; AVX-NEXT:    vpand %ymm0, %ymm12, %ymm0
+; AVX-NEXT:    vpand %ymm7, %ymm12, %ymm7
+; AVX-NEXT:    {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
+; AVX-NEXT:    vpand %ymm6, %ymm12, %ymm3
+; AVX-NEXT:    {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
+; AVX-NEXT:    vpand %ymm5, %ymm12, %ymm2
+; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
+; AVX-NEXT:    vpand %ymm4, %ymm12, %ymm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
+; AVX-NEXT:    vmovdqa %ymm11, %ymm0
+; AVX-NEXT:    vmovdqa %ymm10, %ymm1
+; AVX-NEXT:    vmovdqa %ymm9, %ymm2
+; AVX-NEXT:    vmovdqa %ymm8, %ymm3
+; AVX-NEXT:    movq %rbp, %rsp
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_1024_combine_split:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NEXT:    vpandq %zmm6, %zmm2, %zmm2
+; AVX512-NEXT:    vpandq %zmm6, %zmm0, %zmm0
+; AVX512-NEXT:    vpmadd52luq %zmm2, %zmm0, %zmm4
+; AVX512-NEXT:    vpandq %zmm6, %zmm3, %zmm0
+; AVX512-NEXT:    vpandq %zmm6, %zmm1, %zmm1
+; AVX512-NEXT:    vpmadd52luq %zmm0, %zmm1, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
+; AVX512-NEXT:    retq
+  %x_masked = and <16 x i64> %x, splat (i64 67108863)
+  %y_masked = and <16 x i64> %y, splat (i64 67108863)
+  %mul = mul <16 x i64> %x_masked, %y_masked
+  %res = add <16 x i64> %z, %mul
+  ret <16 x i64> %res
+}
+
+define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
+; X64-LABEL: test_not_v1i64:
+; X64:       # %bb.0:
+; X64-NEXT:    andl $67108863, %edi # imm = 0x3FFFFFF
+; X64-NEXT:    imulq %rdi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdx), %rax
+; X64-NEXT:    retq
+  %x_masked = and <1 x i64> %x, splat (i64 67108863)
+  %y_masked = and <1 x i64> %x, splat (i64 67108863)
+  %mul = mul <1 x i64> %x_masked, %y_masked
+  %res = add <1 x i64> %mul, %z
+  ret <1 x i64> %res
+}
+
+define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
+; AVX-LABEL: test_v3i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX512-NOVL-LABEL: test_v3i64:
+; AVX512-NOVL:       # %bb.0:
+; AVX512-NOVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
+; AVX512-NOVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512-NOVL-NEXT:    retq
+;
+; AVX512VL-LABEL: test_v3i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %x_masked = and <3 x i64> %x, splat (i64 67108863)
+  %y_masked = and <3 x i64> %x, splat (i64 67108863)
+  %mul = mul <3 x i64> %x_masked, %y_masked
+  %res = add <3 x i64> %mul, %z
+  ret <3 x i64> %res
+}
+
+define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
+; AVX-LABEL: test_v5i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm2
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX-NEXT:    vmovq %rcx, %xmm3
+; AVX-NEXT:    vmovq %r9, %xmm4
+; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVX-NEXT:    vpsllq $33, %xmm4, %xmm4
+; AVX-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
+; AVX-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVX-NEXT:    vmovq %xmm1, 32(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v5i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <5 x i64> %x, splat (i64 67108863)
+  %y_masked = and <5 x i64> %x, splat (i64 67108863)
+  %mul = mul <5 x i64> %x_masked, %y_masked
+  %res = add <5 x i64> %mul, %z
+  ret <5 x i64> %res
+}
+
+define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
+; AVX-LABEL: test_v6i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm1
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm1
+; AVX-NEXT:    vmovq %r9, %xmm0
+; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 32(%rdi)
+; AVX-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v6i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+  %x_masked = and <6 x i64> %x, splat (i64 67108863)
+  %y_masked = and <6 x i64> %x, splat (i64 67108863)
+  %mul = mul <6 x i64> %x_masked, %y_masked
+  %res = add <6 x i64> %mul, %z
+  ret <6 x i64> %res
+}
+
+define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) {
+; AVX-LABEL: test_v9i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    vmovq %r8, %xmm0
+; AVX-NEXT:    vmovq %rcx, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovq %rdx, %xmm1
+; AVX-NEXT:    vmovq %rsi, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovq %r9, %xmm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm3
+; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm4
+; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863]
+; AVX-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX-NEXT:    vmovq %rcx, %xmm5
+; AVX-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
+; AVX-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVX-NEXT:    vpsrlq $32, %xmm5, %xmm6
+; AVX-NEXT:    vpmuludq %xmm6, %xmm5, %xmm6
+; AVX-NEXT:    vpsllq $33, %xmm6, %xmm6
+; AVX-NEXT:    vpmuludq %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
+; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm4
+; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm1, %ymm3
+; AVX-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVX-NEXT:    vmovdqa %ymm4, (%rdi)
+; AVX-NEXT:    vmovq %xmm2, 64(%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_v9i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    vmovq %r8, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovq %rdx, %xmm1
+; AVX512-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vmovq %r9, %xmm1
+; AVX512-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm2
+; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX512-NEXT:    vmovq %rcx, %xmm3
+; AVX512-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX512-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVX512-NEXT:    vpsllq $33, %xmm4, %xmm4
+; AVX512-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX512-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; AVX512-NEXT:    vpmadd52luq %zmm0, %zmm0, %zmm2
+; AVX512-NEXT:    vmovq %xmm1, 64(%rdi)
+; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x_masked = and <9 x i64> %x, splat (i64 67108863)
+  %y_masked = and <9 x i64> %x, splat (i64 67108863)
+  %mul = mul <9 x i64> %x_masked, %y_masked
+  %res = add <9 x i64> %mul, %z
+  ret <9 x i64> %res
+}
diff --git a/llvm/test/CodeGen/X86/inline-asm-flag-clobber.ll b/llvm/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 57dccfc1b4a8..0538541a6f7b 100644
--- a/llvm/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -18,9 +18,9 @@ define i64 @t(ptr %arg) nounwind {
         ret i64 0
 }
 
-; Make sure that we translate this to the bswap intrinsic which lowers down without the
-; inline assembly.
-; CHECK-NOT: #APP
+; Make sure this lowers to inline assembly and is not translated to an
+; intrinsic.
+; CHECK: #APP
 define i32 @s(i32 %argc, ptr nocapture %argv) unnamed_addr nounwind {
 entry:
   %0 = trunc i32 %argc to i16
diff --git a/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll b/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
index 3ac0fd7746a3..eccb32346a40 100644
--- a/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
+++ b/llvm/test/CodeGen/X86/ins_subreg_coalesce-3.ll
@@ -22,41 +22,45 @@ define void @FontChange(i1 %foo) nounwind {
 ; CHECK-LABEL: FontChange:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    je .LBB0_10
+; CHECK-NEXT:    je .LBB0_12
+; CHECK-NEXT:  # %bb.1: # %bb298
+; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %bb304
+; CHECK-NEXT:    je .LBB0_4
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_1: # %bb366
+; CHECK-NEXT:  .LBB0_3: # %bb366
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    jne .LBB0_1
-; CHECK-NEXT:  # %bb.2: # %bb428
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  .LBB0_4: # %bb428
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    je .LBB0_10
-; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    je .LBB0_12
+; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    cmpb $0, 0
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_4: # %bb650
+; CHECK-NEXT:  .LBB0_6: # %bb650
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    je .LBB0_4
-; CHECK-NEXT:  # %bb.5: # %bb662
+; CHECK-NEXT:    je .LBB0_6
+; CHECK-NEXT:  # %bb.7: # %bb662
 ; CHECK-NEXT:    movl 0, %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    andl $57344, %ecx # imm = 0xE000
 ; CHECK-NEXT:    cmpl $8192, %ecx # imm = 0x2000
-; CHECK-NEXT:    jne .LBB0_10
-; CHECK-NEXT:  # %bb.6: # %bb4884
+; CHECK-NEXT:    jne .LBB0_12
+; CHECK-NEXT:  # %bb.8: # %bb4884
 ; CHECK-NEXT:    andl $7168, %eax # imm = 0x1C00
 ; CHECK-NEXT:    cmpl $1024, %eax # imm = 0x400
-; CHECK-NEXT:    jne .LBB0_10
-; CHECK-NEXT:  # %bb.7: # %bb4932
+; CHECK-NEXT:    jne .LBB0_12
+; CHECK-NEXT:  # %bb.9: # %bb4932
 ; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    jne .LBB0_10
-; CHECK-NEXT:  # %bb.8: # %bb4940
+; CHECK-NEXT:    jne .LBB0_12
+; CHECK-NEXT:  # %bb.10: # %bb4940
 ; CHECK-NEXT:    movl 0, %eax
 ; CHECK-NEXT:    cmpl $160, %eax
-; CHECK-NEXT:    je .LBB0_10
-; CHECK-NEXT:  # %bb.9: # %bb4940
+; CHECK-NEXT:    je .LBB0_12
+; CHECK-NEXT:  # %bb.11: # %bb4940
 ; CHECK-NEXT:    cmpl $159, %eax
-; CHECK-NEXT:  .LBB0_10: # %bb4897
+; CHECK-NEXT:  .LBB0_12: # %bb4897
 ; CHECK-NEXT:    retq
 entry:
 	br i1 %foo, label %bb298, label %bb49
diff --git a/llvm/test/CodeGen/X86/isel-ceil.ll b/llvm/test/CodeGen/X86/isel-ceil.ll
new file mode 100644
index 000000000000..c82cfebd4814
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-ceil.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+
+define float @ceil_f32(float %a) nounwind readnone {
+; DAG-X64-LABEL: ceil_f32:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp ceilf@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: ceil_f32:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq ceilf@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: ceil_f32:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll ceilf
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: ceil_f32:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp ceilf@PLT # TAILCALL
+  %c = call float @llvm.ceil.f32(float %a)
+  ret float %c
+}
+
+define double @ceil_f64(double %a) nounwind readnone {
+; DAG-X64-LABEL: ceil_f64:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp ceil@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: ceil_f64:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq ceil@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: ceil_f64:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll ceil
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: ceil_f64:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp ceil@PLT # TAILCALL
+  %c = call double @llvm.ceil.f64(double %a)
+  ret double %c
+}
+
+define x86_fp80 @ceil_f80(x86_fp80 %a) nounwind readnone {
+; X64-LABEL: ceil_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq ceill@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+;
+; X86-LABEL: ceil_f80:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll ceill
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: ceil_f80:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    subq $24, %rsp
+; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GISEL-X64-NEXT:    fstpt (%rsp)
+; GISEL-X64-NEXT:    callq ceill@PLT
+; GISEL-X64-NEXT:    addq $24, %rsp
+; GISEL-X64-NEXT:    retq
+  %c = call x86_fp80 @llvm.ceil.f80(x86_fp80 %a)
+  ret x86_fp80 %c
+}
+
diff --git a/llvm/test/CodeGen/X86/isel-floor.ll b/llvm/test/CodeGen/X86/isel-floor.ll
new file mode 100644
index 000000000000..675925b61126
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-floor.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+
+define float @floor_f32(float %a) nounwind readnone {
+; DAG-X64-LABEL: floor_f32:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp floorf@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: floor_f32:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq floorf@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: floor_f32:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll floorf
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: floor_f32:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp floorf@PLT # TAILCALL
+  %c = call float @llvm.floor.f32(float %a)
+  ret float %c
+}
+
+define double @floor_f64(double %a) nounwind readnone {
+; DAG-X64-LABEL: floor_f64:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp floor@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: floor_f64:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq floor@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: floor_f64:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll floor
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: floor_f64:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp floor@PLT # TAILCALL
+  %c = call double @llvm.floor.f64(double %a)
+  ret double %c
+}
+
+define x86_fp80 @floor_f80(x86_fp80 %a) nounwind readnone {
+; X64-LABEL: floor_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq floorl@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+;
+; X86-LABEL: floor_f80:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll floorl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: floor_f80:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    subq $24, %rsp
+; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GISEL-X64-NEXT:    fstpt (%rsp)
+; GISEL-X64-NEXT:    callq floorl@PLT
+; GISEL-X64-NEXT:    addq $24, %rsp
+; GISEL-X64-NEXT:    retq
+  %c = call x86_fp80 @llvm.floor.f80(x86_fp80 %a)
+  ret x86_fp80 %c
+}
+
diff --git a/llvm/test/CodeGen/X86/isel-ftrunc.ll b/llvm/test/CodeGen/X86/isel-ftrunc.ll
new file mode 100644
index 000000000000..9bf06193961a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-ftrunc.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,DAG-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86
+
+define float @trunc_f32(float %a) nounwind readnone {
+; DAG-X64-LABEL: trunc_f32:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp truncf@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: trunc_f32:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq truncf@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: trunc_f32:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fstps (%esp)
+; X86-NEXT:    calll truncf
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: trunc_f32:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp truncf@PLT # TAILCALL
+  %c = call float @llvm.trunc.f32(float %a)
+  ret float %c
+}
+
+define double @trunc_f64(double %a) nounwind readnone {
+; DAG-X64-LABEL: trunc_f64:
+; DAG-X64:       # %bb.0:
+; DAG-X64-NEXT:    jmp trunc@PLT # TAILCALL
+;
+; FASTISEL-X64-LABEL: trunc_f64:
+; FASTISEL-X64:       # %bb.0:
+; FASTISEL-X64-NEXT:    pushq %rax
+; FASTISEL-X64-NEXT:    callq trunc@PLT
+; FASTISEL-X64-NEXT:    popq %rax
+; FASTISEL-X64-NEXT:    retq
+;
+; X86-LABEL: trunc_f64:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll trunc
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: trunc_f64:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    jmp trunc@PLT # TAILCALL
+  %c = call double @llvm.trunc.f64(double %a)
+  ret double %c
+}
+
+define x86_fp80 @trunc_f80(x86_fp80   %a) nounwind readnone {
+; X64-LABEL: trunc_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq truncl@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+;
+; X86-LABEL: trunc_f80:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll truncl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: trunc_f80:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    subq $24, %rsp
+; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; GISEL-X64-NEXT:    fstpt (%rsp)
+; GISEL-X64-NEXT:    callq truncl@PLT
+; GISEL-X64-NEXT:    addq $24, %rsp
+; GISEL-X64-NEXT:    retq
+  %c = call x86_fp80   @llvm.trunc.f80(x86_fp80   %a)
+  ret x86_fp80   %c
+}
+
diff --git a/llvm/test/CodeGen/X86/llvm.acos.ll b/llvm/test/CodeGen/X86/isel-llvm.acos.ll
index 9176cf47bda7..9176cf47bda7 100644
--- a/llvm/test/CodeGen/X86/llvm.acos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.acos.ll
diff --git a/llvm/test/CodeGen/X86/llvm.asin.ll b/llvm/test/CodeGen/X86/isel-llvm.asin.ll
index 87ffcc9c963c..87ffcc9c963c 100644
--- a/llvm/test/CodeGen/X86/llvm.asin.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.asin.ll
diff --git a/llvm/test/CodeGen/X86/llvm.atan.ll b/llvm/test/CodeGen/X86/isel-llvm.atan.ll
index c03361d18c1d..c03361d18c1d 100644
--- a/llvm/test/CodeGen/X86/llvm.atan.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.atan.ll
diff --git a/llvm/test/CodeGen/X86/llvm.atan2.ll b/llvm/test/CodeGen/X86/isel-llvm.atan2.ll
index aa56068e1778..aa56068e1778 100644
--- a/llvm/test/CodeGen/X86/llvm.atan2.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.atan2.ll
diff --git a/llvm/test/CodeGen/X86/llvm.cos.ll b/llvm/test/CodeGen/X86/isel-llvm.cos.ll
index af039854d349..af039854d349 100644
--- a/llvm/test/CodeGen/X86/llvm.cos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.cos.ll
diff --git a/llvm/test/CodeGen/X86/llvm.cosh.ll b/llvm/test/CodeGen/X86/isel-llvm.cosh.ll
index a61867c11fd4..a61867c11fd4 100644
--- a/llvm/test/CodeGen/X86/llvm.cosh.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.cosh.ll
diff --git a/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll b/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll
new file mode 100644
index 000000000000..688add1e92ab
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,SDAG-X86-NOSSE
+; RUN: llc < %s -mtriple=i686-- -fast-isel -fast-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,FASTISEL-X86-NOSSE
+; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=2 -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,GISEL-X86-NOSSE
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,SDAG-X64-NOSSE
+; RUN: llc < %s -mtriple=x86_64-- -fast-isel -fast-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,FASTISEL-X64-NOSSE
+; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=2 -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,GISEL-X64-NOSSE
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86,SDAG-X86
+; RUN: llc < %s -mtriple=i686-- -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-- -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64,FASTISEL-X64
+; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64
+
+declare void @llvm.set.rounding(i32 %x)
+
+define void @func_01() nounwind {
+; X86-NOSSE-LABEL: func_01:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    orb $12, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fldcw (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    retl
+;
+; X64-NOSSE-LABEL: func_01:
+; X64-NOSSE:       # %bb.0:
+; X64-NOSSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    orb $12, -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    retq
+;
+; X86-LABEL: func_01:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fnstcw (%esp)
+; X86-NEXT:    orb $12, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldcw (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: func_01:
+; X64:       # %bb.0:
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    orb $12, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    orb $96, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+  call void @llvm.set.rounding(i32 0)  ; TowardZero (CW[11-10] = 11)
+  ret void
+}
+
+define void @func_02() nounwind {
+; X86-NOSSE-LABEL: func_02:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    andb $-13, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fldcw (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    retl
+;
+; X64-NOSSE-LABEL: func_02:
+; X64-NOSSE:       # %bb.0:
+; X64-NOSSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    andb $-13, -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    retq
+;
+; X86-LABEL: func_02:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fnstcw (%esp)
+; X86-NEXT:    andb $-13, {{[0-9]+}}(%esp)
+; X86-NEXT:    fldcw (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: func_02:
+; X64:       # %bb.0:
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andb $-13, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andb $-97, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+  call void @llvm.set.rounding(i32 1)  ; ToNearestTiesToEven (CW[11-10] = 00)
+  ret void
+}
+
+define void @func_03() nounwind {
+; X86-NOSSE-LABEL: func_03:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X86-NOSSE-NEXT:    andl (%esp), %eax
+; X86-NOSSE-NEXT:    orl $2048, %eax # imm = 0x800
+; X86-NOSSE-NEXT:    movw %ax, (%esp)
+; X86-NOSSE-NEXT:    fldcw (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    retl
+;
+; X64-NOSSE-LABEL: func_03:
+; X64-NOSSE:       # %bb.0:
+; X64-NOSSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X64-NOSSE-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NOSSE-NEXT:    orl $2048, %eax # imm = 0x800
+; X64-NOSSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    retq
+;
+; X86-LABEL: func_03:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fnstcw (%esp)
+; X86-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X86-NEXT:    andl (%esp), %eax
+; X86-NEXT:    orl $2048, %eax # imm = 0x800
+; X86-NEXT:    movw %ax, (%esp)
+; X86-NEXT:    fldcw (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: func_03:
+; X64:       # %bb.0:
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    orl $2048, %eax # imm = 0x800
+; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-24577, %eax # imm = 0x9FFF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    orl $16384, %eax # imm = 0x4000
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+  call void @llvm.set.rounding(i32 2)  ; Upward (CW[11-10] = 10)
+  ret void
+}
+
+define void @func_04() nounwind {
+; X86-NOSSE-LABEL: func_04:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X86-NOSSE-NEXT:    andl (%esp), %eax
+; X86-NOSSE-NEXT:    orl $1024, %eax # imm = 0x400
+; X86-NOSSE-NEXT:    movw %ax, (%esp)
+; X86-NOSSE-NEXT:    fldcw (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    retl
+;
+; X64-NOSSE-LABEL: func_04:
+; X64-NOSSE:       # %bb.0:
+; X64-NOSSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X64-NOSSE-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NOSSE-NEXT:    orl $1024, %eax # imm = 0x400
+; X64-NOSSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    retq
+;
+; X86-LABEL: func_04:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    fnstcw (%esp)
+; X86-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X86-NEXT:    andl (%esp), %eax
+; X86-NEXT:    orl $1024, %eax # imm = 0x400
+; X86-NEXT:    movw %ax, (%esp)
+; X86-NEXT:    fldcw (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: func_04:
+; X64:       # %bb.0:
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-3073, %eax # imm = 0xF3FF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    orl $1024, %eax # imm = 0x400
+; X64-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-24577, %eax # imm = 0x9FFF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    orl $8192, %eax # imm = 0x2000
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+  call void @llvm.set.rounding(i32 3)  ; Downward (CW[11-10] = 01)
+  ret void
+}
+
+define void @func_05(i32 %x) nounwind {
+; X86-NOSSE-LABEL: func_05:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-NOSSE-NEXT:    movl $201, %eax
+; X86-NOSSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NOSSE-NEXT:    shll %cl, %eax
+; X86-NOSSE-NEXT:    andl $3072, %eax # imm = 0xC00
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
+; X86-NOSSE-NEXT:    andl (%esp), %ecx
+; X86-NOSSE-NEXT:    orl %eax, %ecx
+; X86-NOSSE-NEXT:    movw %cx, (%esp)
+; X86-NOSSE-NEXT:    fldcw (%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    retl
+;
+; X64-NOSSE-LABEL: func_05:
+; X64-NOSSE:       # %bb.0:
+; X64-NOSSE-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NOSSE-NEXT:    leal 4(%rdi,%rdi), %ecx
+; X64-NOSSE-NEXT:    movl $201, %eax
+; X64-NOSSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOSSE-NEXT:    shll %cl, %eax
+; X64-NOSSE-NEXT:    andl $3072, %eax # imm = 0xC00
+; X64-NOSSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
+; X64-NOSSE-NEXT:    andl -{{[0-9]+}}(%rsp), %ecx
+; X64-NOSSE-NEXT:    orl %eax, %ecx
+; X64-NOSSE-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NOSSE-NEXT:    retq
+;
+; X86-LABEL: func_05:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-NEXT:    movl $201, %eax
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    andl $3072, %eax # imm = 0xC00
+; X86-NEXT:    fnstcw (%esp)
+; X86-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
+; X86-NEXT:    andl (%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movw %cx, (%esp)
+; X86-NEXT:    fldcw (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: func_05:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal 4(%rdi,%rdi), %ecx
+; X64-NEXT:    movl $201, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    andl $3072, %eax # imm = 0xC00
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-3073, %ecx # imm = 0xF3FF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movl $-24577, %ecx # imm = 0x9FFF
+; X64-NEXT:    andl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    leal (%rcx,%rax,8), %eax
+; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+  call void @llvm.set.rounding(i32 %x)  ; Downward
+  ret void
+}
+
+attributes #0 = { nounwind "use-soft-float"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FASTISEL-X64: {{.*}}
+; FASTISEL-X64-NOSSE: {{.*}}
+; FASTISEL-X86: {{.*}}
+; FASTISEL-X86-NOSSE: {{.*}}
+; GISEL-X64: {{.*}}
+; GISEL-X64-NOSSE: {{.*}}
+; GISEL-X86: {{.*}}
+; GISEL-X86-NOSSE: {{.*}}
+; SDAG-X64: {{.*}}
+; SDAG-X64-NOSSE: {{.*}}
+; SDAG-X86: {{.*}}
+; SDAG-X86-NOSSE: {{.*}}
diff --git a/llvm/test/CodeGen/X86/llvm.sin.ll b/llvm/test/CodeGen/X86/isel-llvm.sin.ll
index 0f17f83d0102..0f17f83d0102 100644
--- a/llvm/test/CodeGen/X86/llvm.sin.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sin.ll
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f91457..065710f91457 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
diff --git a/llvm/test/CodeGen/X86/llvm.sinh.ll b/llvm/test/CodeGen/X86/isel-llvm.sinh.ll
index ef30f8de0695..ef30f8de0695 100644
--- a/llvm/test/CodeGen/X86/llvm.sinh.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sinh.ll
diff --git a/llvm/test/CodeGen/X86/llvm.tan.ll b/llvm/test/CodeGen/X86/isel-llvm.tan.ll
index 4e76653cd129..4e76653cd129 100644
--- a/llvm/test/CodeGen/X86/llvm.tan.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.tan.ll
diff --git a/llvm/test/CodeGen/X86/llvm.tanh.ll b/llvm/test/CodeGen/X86/isel-llvm.tanh.ll
index c4f6e2f179cf..c4f6e2f179cf 100644
--- a/llvm/test/CodeGen/X86/llvm.tanh.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.tanh.ll
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index cab810d30cd7..8b1e69a97d54 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -143,6 +143,57 @@ define <8 x i1> @invert_i8_mask_extract_8(i8 %mask) {
   ret <8 x i1> %cmp.45
 }
 
+define <8 x i1> @i8_mask_extract_7(i8 %mask) {
+; X64-AVX512-LABEL: i8_mask_extract_7:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    shrb %dil
+; X64-AVX512-NEXT:    movzbl %dil, %eax
+; X64-AVX512-NEXT:    kmovd %eax, %k0
+; X64-AVX512-NEXT:    vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT:    retq
+;
+; X64-KNL-LABEL: i8_mask_extract_7:
+; X64-KNL:       # %bb.0:
+; X64-KNL-NEXT:    vmovd %edi, %xmm0
+; X64-KNL-NEXT:    vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,4,8,16,32,64,128,0,2,4,8,16,32,64,128,0]
+; X64-KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT:    retq
+  %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+  %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %1 = and <8 x i8> %.splat, <i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 poison>
+  %cmp.45 = icmp ne <8 x i8> %1, zeroinitializer
+  ret <8 x i1> %cmp.45
+}
+
+define <8 x i1> @invert_i8_mask_extract_7(i8 %mask) {
+; X64-AVX512-LABEL: invert_i8_mask_extract_7:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    shrb %dil
+; X64-AVX512-NEXT:    movzbl %dil, %eax
+; X64-AVX512-NEXT:    kmovd %eax, %k0
+; X64-AVX512-NEXT:    knotb %k0, %k0
+; X64-AVX512-NEXT:    vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT:    retq
+;
+; X64-KNL-LABEL: invert_i8_mask_extract_7:
+; X64-KNL:       # %bb.0:
+; X64-KNL-NEXT:    vmovd %edi, %xmm0
+; X64-KNL-NEXT:    vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT:    retq
+  %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+  %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+  %1 = and <8 x i8> %.splat, <i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 poison>
+  %cmp.45 = icmp eq <8 x i8> %1, zeroinitializer
+  ret <8 x i1> %cmp.45
+}
+
 define <4 x i1> @i16_mask_extract_4(i16 %mask) {
 ; X64-AVX512-LABEL: i16_mask_extract_4:
 ; X64-AVX512:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll
index 7bcf57311853..5f38645f7463 100644
--- a/llvm/test/CodeGen/X86/llrint-conv.ll
+++ b/llvm/test/CodeGen/X86/llrint-conv.ll
@@ -7,47 +7,15 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
 
-define i64 @testmsxh(half %x) nounwind {
-; X86-NOSSE-LABEL: testmsxh:
-; X86-NOSSE:       # %bb.0: # %entry
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl %eax, (%esp)
-; X86-NOSSE-NEXT:    calll __extendhfsf2
-; X86-NOSSE-NEXT:    fstps (%esp)
-; X86-NOSSE-NEXT:    calll llrintf
-; X86-NOSSE-NEXT:    popl %ecx
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: testmsxh:
-; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-SSE2-NEXT:    pextrw $0, %xmm0, %eax
-; X86-SSE2-NEXT:    movw %ax, (%esp)
-; X86-SSE2-NEXT:    calll __extendhfsf2
-; X86-SSE2-NEXT:    fstps (%esp)
-; X86-SSE2-NEXT:    calll llrintf
-; X86-SSE2-NEXT:    popl %ecx
-; X86-SSE2-NEXT:    retl
-;
-; X64-SSE-LABEL: testmsxh:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    callq __extendhfsf2@PLT
-; X64-SSE-NEXT:    callq rintf@PLT
-; X64-SSE-NEXT:    callq __truncsfhf2@PLT
-; X64-SSE-NEXT:    callq __extendhfsf2@PLT
-; X64-SSE-NEXT:    cvttss2si %xmm0, %rax
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.llrint.i64.f16(half %x)
-  ret i64 %0
-}
+; FIXME: crash
+; define i64 @test_llrint_i64_f16(half %x) nounwind {
+; entry:
+;   %0 = tail call i64 @llvm.llrint.i64.f16(half %x)
+;   ret i64 %0
+; }
 
-define i64 @testmsxs(float %x) nounwind {
-; X86-NOSSE-LABEL: testmsxs:
+define i64 @test_llrint_i64_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_llrint_i64_f32:
 ; X86-NOSSE:       # %bb.0: # %entry
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
@@ -61,7 +29,7 @@ define i64 @testmsxs(float %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmsxs:
+; X86-SSE2-LABEL: test_llrint_i64_f32:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
@@ -77,7 +45,7 @@ define i64 @testmsxs(float %x) nounwind {
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmsxs:
+; X86-AVX-LABEL: test_llrint_i64_f32:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %ebp
 ; X86-AVX-NEXT:    movl %esp, %ebp
@@ -93,12 +61,12 @@ define i64 @testmsxs(float %x) nounwind {
 ; X86-AVX-NEXT:    popl %ebp
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: testmsxs:
+; X64-SSE-LABEL: test_llrint_i64_f32:
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: testmsxs:
+; X64-AVX-LABEL: test_llrint_i64_f32:
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
 ; X64-AVX-NEXT:    retq
@@ -107,8 +75,8 @@ entry:
   ret i64 %0
 }
 
-define i64 @testmsxd(double %x) nounwind {
-; X86-NOSSE-LABEL: testmsxd:
+define i64 @test_llrint_i64_f64(double %x) nounwind {
+; X86-NOSSE-LABEL: test_llrint_i64_f64:
 ; X86-NOSSE:       # %bb.0: # %entry
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
@@ -122,7 +90,7 @@ define i64 @testmsxd(double %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmsxd:
+; X86-SSE2-LABEL: test_llrint_i64_f64:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
@@ -138,7 +106,7 @@ define i64 @testmsxd(double %x) nounwind {
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmsxd:
+; X86-AVX-LABEL: test_llrint_i64_f64:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %ebp
 ; X86-AVX-NEXT:    movl %esp, %ebp
@@ -154,12 +122,12 @@ define i64 @testmsxd(double %x) nounwind {
 ; X86-AVX-NEXT:    popl %ebp
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: testmsxd:
+; X64-SSE-LABEL: test_llrint_i64_f64:
 ; X64-SSE:       # %bb.0: # %entry
 ; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: testmsxd:
+; X64-AVX-LABEL: test_llrint_i64_f64:
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
 ; X64-AVX-NEXT:    retq
@@ -168,8 +136,8 @@ entry:
   ret i64 %0
 }
 
-define i64 @testmsll(x86_fp80 %x) nounwind {
-; X86-LABEL: testmsll:
+define i64 @test_llrint_i64_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_llrint_i64_f80:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
@@ -183,7 +151,7 @@ define i64 @testmsll(x86_fp80 %x) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: testmsll:
+; X64-LABEL: test_llrint_i64_f80:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fistpll -{{[0-9]+}}(%rsp)
@@ -195,8 +163,8 @@ entry:
 }
 
 ; FIXME(#44744): incorrect libcall
-define i64 @testmslq(fp128 %x) nounwind {
-; X86-NOSSE-LABEL: testmslq:
+define i64 @test_llrint_i64_f128(fp128 %x) nounwind {
+; X86-NOSSE-LABEL: test_llrint_i64_f128:
 ; X86-NOSSE:       # %bb.0: # %entry
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
@@ -212,7 +180,7 @@ define i64 @testmslq(fp128 %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmslq:
+; X86-SSE2-LABEL: test_llrint_i64_f128:
 ; X86-SSE2:       # %bb.0: # %entry
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
@@ -228,7 +196,7 @@ define i64 @testmslq(fp128 %x) nounwind {
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmslq:
+; X86-AVX-LABEL: test_llrint_i64_f128:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %ebp
 ; X86-AVX-NEXT:    movl %esp, %ebp
@@ -241,11 +209,181 @@ define i64 @testmslq(fp128 %x) nounwind {
 ; X86-AVX-NEXT:    popl %ebp
 ; X86-AVX-NEXT:    retl
 ;
-; X64-LABEL: testmslq:
+; X64-LABEL: test_llrint_i64_f128:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    jmp llrintl@PLT # TAILCALL
 entry:
-  %0 = tail call i64 @llvm.llrint.i64.fp128(fp128 %x)
+  %0 = tail call i64 @llvm.llrint.i64.f128(fp128 %x)
+  ret i64 %0
+}
+
+; FIXME: crash
+; define i64 @test_llrint_i64_f16_strict(half %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.llrint.i64.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
+
+define i64 @test_llrint_i64_f32_strict(float %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_llrint_i64_f32_strict:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    wait
+; X86-NOSSE-NEXT:    calll llrintf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_llrint_i64_f32_strict:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llrintf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_llrint_i64_f32_strict:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %eax
+; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vmovss %xmm0, (%esp)
+; X86-AVX-NEXT:    calll llrintf
+; X86-AVX-NEXT:    popl %ecx
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_llrint_i64_f32_strict:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq llrintf@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.experimental.constrained.llrint.i64.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %0
+}
+
+define i64 @test_llrint_i64_f64_strict(double %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_llrint_i64_f64_strict:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    wait
+; X86-NOSSE-NEXT:    calll llrint
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_llrint_i64_f64_strict:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llrint
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_llrint_i64_f64_strict:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    subl $8, %esp
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT:    calll llrint
+; X86-AVX-NEXT:    addl $8, %esp
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_llrint_i64_f64_strict:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq llrint@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.experimental.constrained.llrint.i64.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %0
+}
+
+define i64 @test_llrint_i64_f80_strict(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: test_llrint_i64_f80_strict:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll llrintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_llrint_i64_f80_strict:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq llrintl@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %0
+}
+
+; FIXME(#44744): incorrect libcall
+define i64 @test_llrint_i64_f128_strict(fp128 %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_llrint_i64_f128_strict:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    subl $16, %esp
+; X86-NOSSE-NEXT:    pushl 20(%ebp)
+; X86-NOSSE-NEXT:    pushl 16(%ebp)
+; X86-NOSSE-NEXT:    pushl 12(%ebp)
+; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    calll llrintl
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    popl %ebp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_llrint_i64_f128_strict:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    pushl 20(%ebp)
+; X86-SSE2-NEXT:    pushl 16(%ebp)
+; X86-SSE2-NEXT:    pushl 12(%ebp)
+; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    calll llrintl
+; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_llrint_i64_f128_strict:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    pushl %ebp
+; X86-AVX-NEXT:    movl %esp, %ebp
+; X86-AVX-NEXT:    andl $-16, %esp
+; X86-AVX-NEXT:    subl $32, %esp
+; X86-AVX-NEXT:    vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
+; X86-AVX-NEXT:    calll llrintl
+; X86-AVX-NEXT:    movl %ebp, %esp
+; X86-AVX-NEXT:    popl %ebp
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_llrint_i64_f128_strict:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq llrintl@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
   ret i64 %0
 }
 
diff --git a/llvm/test/CodeGen/X86/llround-conv.ll b/llvm/test/CodeGen/X86/llround-conv.ll
index 19a980b72809..ef4df82e9e57 100644
--- a/llvm/test/CodeGen/X86/llround-conv.ll
+++ b/llvm/test/CodeGen/X86/llround-conv.ll
@@ -1,88 +1,84 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown             | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=i686-unknown              | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2  | FileCheck %s --check-prefixes=X86,X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown            | FileCheck %s --check-prefixes=X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
 
-define i64 @testmsxs(float %x) {
-; X86-LABEL: testmsxs:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp)
-; X86-NEXT:    calll llroundf
-; X86-NEXT:    popl %ecx
-; X86-NEXT:    .cfi_def_cfa_offset 4
-; X86-NEXT:    retl
+; FIXME: crash
+; define i64 @test_llround_f16(half %x) nounwind {
+;   %conv = tail call i64 @llvm.llround.f16(half %x)
+;   ret i64 %conv
+; }
+
+define i64 @test_llround_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_llround_f32:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    calll llroundf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
 ;
-; SSE2-LABEL: testmsxs:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    pushl %eax
-; SSE2-NEXT:    .cfi_def_cfa_offset 8
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss %xmm0, (%esp)
-; SSE2-NEXT:    calll llroundf
-; SSE2-NEXT:    popl %ecx
-; SSE2-NEXT:    .cfi_def_cfa_offset 4
-; SSE2-NEXT:    retl
+; X86-SSE2-LABEL: test_llround_f32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llroundf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
 ;
-; GISEL-X86-LABEL: testmsxs:
-; GISEL-X86:       # %bb.0: # %entry
+; X64-LABEL: test_llround_f32:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llroundf@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_llround_f32:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl %eax, (%esp)
 ; GISEL-X86-NEXT:    calll llroundf
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmsxs:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp llroundf@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsxs:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_llround_f32:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X64-NEXT:    callq llroundf
 ; GISEL-X64-NEXT:    popq %rcx
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.llround.f32(float %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.llround.f32(float %x)
+  ret i64 %conv
 }
 
-define i64 @testmsxd(double %x) {
-; X86-LABEL: testmsxd:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpl (%esp)
-; X86-NEXT:    calll llround
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
-; X86-NEXT:    retl
+define i64 @test_llround_f64(double %x) nounwind {
+; X86-NOSSE-LABEL: test_llround_f64:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    calll llround
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
 ;
-; SSE2-LABEL: testmsxd:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    subl $8, %esp
-; SSE2-NEXT:    .cfi_def_cfa_offset 12
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movsd %xmm0, (%esp)
-; SSE2-NEXT:    calll llround
-; SSE2-NEXT:    addl $8, %esp
-; SSE2-NEXT:    .cfi_def_cfa_offset 4
-; SSE2-NEXT:    retl
+; X86-SSE2-LABEL: test_llround_f64:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llround
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
 ;
-; GISEL-X86-LABEL: testmsxd:
-; GISEL-X86:       # %bb.0: # %entry
+; X64-LABEL: test_llround_f64:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llround@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_llround_f64:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; GISEL-X86-NEXT:    movl 4(%eax), %eax
@@ -92,111 +88,140 @@ define i64 @testmsxd(double %x) {
 ; GISEL-X86-NEXT:    movl %eax, 4(%edx)
 ; GISEL-X86-NEXT:    calll llround
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmsxd:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp llround@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsxd:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_llround_f64:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X64-NEXT:    callq llround
 ; GISEL-X64-NEXT:    popq %rcx
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.llround.f64(double %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.llround.f64(double %x)
+  ret i64 %conv
 }
 
-define i64 @testmsll(x86_fp80 %x) {
-; X86-LABEL: testmsll:
-; X86:       # %bb.0: # %entry
+define i64 @test_llround_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_llround_f80:
+; X86:       # %bb.0:
 ; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fstpt (%esp)
 ; X86-NEXT:    calll llroundl
 ; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
-; SSE2-LABEL: testmsll:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    subl $12, %esp
-; SSE2-NEXT:    .cfi_def_cfa_offset 16
-; SSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; SSE2-NEXT:    fstpt (%esp)
-; SSE2-NEXT:    calll llroundl
-; SSE2-NEXT:    addl $12, %esp
-; SSE2-NEXT:    .cfi_def_cfa_offset 4
-; SSE2-NEXT:    retl
+; X64-LABEL: test_llround_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llroundl@PLT # TAILCALL
 ;
-; GISEL-X86-LABEL: testmsll:
-; GISEL-X86:       # %bb.0: # %entry
+; GISEL-X86-LABEL: test_llround_f80:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    fstpt (%esp)
 ; GISEL-X86-NEXT:    calll llroundl
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmsll:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp llroundl@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsll:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_llround_f80:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    subq $24, %rsp
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 32
 ; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; GISEL-X64-NEXT:    fstpt (%rsp)
 ; GISEL-X64-NEXT:    callq llroundl
 ; GISEL-X64-NEXT:    addq $24, %rsp
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.llround.f80(x86_fp80 %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.llround.f80(x86_fp80 %x)
+  ret i64 %conv
 }
 
-define i64 @test_llround_i64_f32(float %x) nounwind {
-; X86-LABEL: test_llround_i64_f32:
+; FIXME(#44744): incorrect libcall
+define i64 @test_llround_f128(fp128 %x) nounwind {
+; X86-LABEL: test_llround_f128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    fstps (%esp)
-; X86-NEXT:    calll llroundf
-; X86-NEXT:    popl %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll llroundl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; SSE2-LABEL: test_llround_i64_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pushl %eax
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss %xmm0, (%esp)
-; SSE2-NEXT:    calll llroundf
-; SSE2-NEXT:    popl %ecx
-; SSE2-NEXT:    retl
+; X64-LABEL: test_llround_f128:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llroundl@PLT # TAILCALL
 ;
-; GISEL-X86-LABEL: test_llround_i64_f32:
+; GISEL-X86-LABEL: test_llround_f128:
 ; GISEL-X86:       # %bb.0:
-; GISEL-X86-NEXT:    subl $12, %esp
+; GISEL-X86-NEXT:    pushl %esi
+; GISEL-X86-NEXT:    subl $24, %esp
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; GISEL-X86-NEXT:    movl %eax, (%esp)
-; GISEL-X86-NEXT:    calll llroundf
-; GISEL-X86-NEXT:    addl $12, %esp
+; GISEL-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    calll llroundf128
+; GISEL-X86-NEXT:    addl $24, %esp
+; GISEL-X86-NEXT:    popl %esi
 ; GISEL-X86-NEXT:    retl
 ;
+; GISEL-X64-LABEL: test_llround_f128:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    pushq %rax
+; GISEL-X64-NEXT:    callq llroundf128
+; GISEL-X64-NEXT:    popq %rcx
+; GISEL-X64-NEXT:    retq
+  %conv = tail call i64 @llvm.llround.f128(fp128 %x)
+  ret i64 %conv
+}
+
+; FIXME: crash
+; define i64 @test_llround_i64_f16(half %x) nounwind {
+;   %conv = call i64 @llvm.llround.i64.f16(half %x)
+;   ret i64 %conv
+; }
+
+define i64 @test_llround_i64_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_llround_i64_f32:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    calll llroundf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_llround_i64_f32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llroundf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
+;
 ; X64-LABEL: test_llround_i64_f32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    jmp llroundf@PLT # TAILCALL
 ;
+; GISEL-X86-LABEL: test_llround_i64_f32:
+; GISEL-X86:       # %bb.0:
+; GISEL-X86-NEXT:    subl $12, %esp
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT:    movl %eax, (%esp)
+; GISEL-X86-NEXT:    calll llroundf
+; GISEL-X86-NEXT:    addl $12, %esp
+; GISEL-X86-NEXT:    retl
+;
 ; GISEL-X64-LABEL: test_llround_i64_f32:
 ; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
@@ -208,23 +233,27 @@ define i64 @test_llround_i64_f32(float %x) nounwind {
 }
 
 define i64 @test_llround_i64_f64(double %x) nounwind {
-; X86-LABEL: test_llround_i64_f64:
-; X86:       # %bb.0:
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    fldl {{[0-9]+}}(%esp)
-; X86-NEXT:    fstpl (%esp)
-; X86-NEXT:    calll llround
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    retl
+; X86-NOSSE-LABEL: test_llround_i64_f64:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    calll llround
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_llround_i64_f64:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll llround
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
 ;
-; SSE2-LABEL: test_llround_i64_f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subl $8, %esp
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movsd %xmm0, (%esp)
-; SSE2-NEXT:    calll llround
-; SSE2-NEXT:    addl $8, %esp
-; SSE2-NEXT:    retl
+; X64-LABEL: test_llround_i64_f64:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llround@PLT # TAILCALL
 ;
 ; GISEL-X86-LABEL: test_llround_i64_f64:
 ; GISEL-X86:       # %bb.0:
@@ -240,10 +269,6 @@ define i64 @test_llround_i64_f64(double %x) nounwind {
 ; GISEL-X86-NEXT:    addl $12, %esp
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: test_llround_i64_f64:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp llround@PLT # TAILCALL
-;
 ; GISEL-X64-LABEL: test_llround_i64_f64:
 ; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
@@ -264,14 +289,9 @@ define i64 @test_llround_i64_f80(x86_fp80 %x) nounwind {
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
-; SSE2-LABEL: test_llround_i64_f80:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subl $12, %esp
-; SSE2-NEXT:    fldt {{[0-9]+}}(%esp)
-; SSE2-NEXT:    fstpt (%esp)
-; SSE2-NEXT:    calll llroundl
-; SSE2-NEXT:    addl $12, %esp
-; SSE2-NEXT:    retl
+; X64-LABEL: test_llround_i64_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llroundl@PLT # TAILCALL
 ;
 ; GISEL-X86-LABEL: test_llround_i64_f80:
 ; GISEL-X86:       # %bb.0:
@@ -282,10 +302,6 @@ define i64 @test_llround_i64_f80(x86_fp80 %x) nounwind {
 ; GISEL-X86-NEXT:    addl $12, %esp
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: test_llround_i64_f80:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp llroundl@PLT # TAILCALL
-;
 ; GISEL-X64-LABEL: test_llround_i64_f80:
 ; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    subq $24, %rsp
@@ -297,3 +313,79 @@ define i64 @test_llround_i64_f80(x86_fp80 %x) nounwind {
   %conv = call i64 @llvm.llround.i64.f80(x86_fp80 %x)
   ret i64 %conv
 }
+
+; FIXME(#44744): incorrect libcall
+define i64 @test_llround_i64_f128(fp128 %x) nounwind {
+; X86-LABEL: test_llround_i64_f128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll llroundl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_llround_i64_f128:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp llroundl@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_llround_i64_f128:
+; GISEL-X86:       # %bb.0:
+; GISEL-X86-NEXT:    pushl %esi
+; GISEL-X86-NEXT:    subl $24, %esp
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT:    movl %eax, (%esp)
+; GISEL-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    calll llroundf128
+; GISEL-X86-NEXT:    addl $24, %esp
+; GISEL-X86-NEXT:    popl %esi
+; GISEL-X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: test_llround_i64_f128:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    pushq %rax
+; GISEL-X64-NEXT:    callq llroundf128
+; GISEL-X64-NEXT:    popq %rcx
+; GISEL-X64-NEXT:    retq
+  %conv = call i64 @llvm.llround.i64.f128(fp128 %x)
+  ret i64 %conv
+}
+
+; FIXME: not yet implemented for global isel
+; define i64 @test_llround_i64_f16_strict(half %x) nounwind strictfp {
+;   %conv = call i64 @llvm.experimental.constrained.llround.i64.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
+
+; define i64 @test_llround_i64_f32_strict(float %x) nounwind strictfp {
+;   %conv = call i64 @llvm.experimental.constrained.llround.i64.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
+
+; define i64 @test_llround_i64_f64_strict(double %x) nounwind strictfp {
+;   %conv = call i64 @llvm.experimental.constrained.llround.i64.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
+
+; define i64 @test_llround_i64_f80_strict(x86_fp80 %x) nounwind strictfp {
+;   %conv = call i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
+
+; ; FIXME(#44744): incorrect libcall
+; define i64 @test_llround_i64_f128_strict(fp128 %x) nounwind strictfp {
+;   %conv = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i32.ll b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
index 3c50aea1095f..2b99b4c50f58 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i32.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i32.ll
@@ -8,15 +8,15 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64,X64-AVX
 
 ; FIXME: crash
-; define i32 @testmswh(half %x) nounwind {
+; define i32 @test_lrint_i32_f16(half %x) nounwind {
 ; entry:
 ;   %0 = tail call i32 @llvm.lrint.i32.f16(half %x)
 ;   ret i32 %0
 ; }
 
-define i32 @testmsws(float %x) nounwind {
-; X86-NOSSE-LABEL: testmsws:
-; X86-NOSSE:       # %bb.0: # %entry
+define i32 @test_lrint_i32_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i32_f32:
+; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
 ; X86-NOSSE-NEXT:    fistpl (%esp)
@@ -24,33 +24,32 @@ define i32 @testmsws(float %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ecx
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmsws:
-; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-LABEL: test_lrint_i32_f32:
+; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    cvtss2si {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmsws:
-; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-LABEL: test_lrint_i32_f32:
+; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    vcvtss2si {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: testmsws:
-; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-LABEL: test_lrint_i32_f32:
+; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    cvtss2si %xmm0, %eax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: testmsws:
-; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-LABEL: test_lrint_i32_f32:
+; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vcvtss2si %xmm0, %eax
 ; X64-AVX-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lrint.i32.f32(float %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lrint.i32.f32(float %x)
+  ret i32 %conv
 }
 
-define i32 @testmswd(double %x) nounwind {
-; X86-NOSSE-LABEL: testmswd:
-; X86-NOSSE:       # %bb.0: # %entry
+define i32 @test_lrint_i32_f64(double %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i32_f64:
+; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %eax
 ; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
 ; X86-NOSSE-NEXT:    fistpl (%esp)
@@ -58,33 +57,32 @@ define i32 @testmswd(double %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ecx
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmswd:
-; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-LABEL: test_lrint_i32_f64:
+; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    cvtsd2si {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmswd:
-; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-LABEL: test_lrint_i32_f64:
+; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    vcvtsd2si {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    retl
 ;
-; X64-SSE-LABEL: testmswd:
-; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-LABEL: test_lrint_i32_f64:
+; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    cvtsd2si %xmm0, %eax
 ; X64-SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: testmswd:
-; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-LABEL: test_lrint_i32_f64:
+; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vcvtsd2si %xmm0, %eax
 ; X64-AVX-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lrint.i32.f64(double %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lrint.i32.f64(double %x)
+  ret i32 %conv
 }
 
-define i32 @testmsll(x86_fp80 %x) nounwind {
-; X86-LABEL: testmsll:
-; X86:       # %bb.0: # %entry
+define i32 @test_lrint_i32_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_lrint_i32_f80:
+; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; X86-NEXT:    fistpl (%esp)
@@ -92,21 +90,20 @@ define i32 @testmsll(x86_fp80 %x) nounwind {
 ; X86-NEXT:    popl %ecx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: testmsll:
-; X64:       # %bb.0: # %entry
+; X64-LABEL: test_lrint_i32_f80:
+; X64:       # %bb.0:
 ; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT:    fistpl -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lrint.i32.f80(x86_fp80 %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lrint.i32.f80(x86_fp80 %x)
+  ret i32 %conv
 }
 
 ; FIXME(#44744): incorrect libcall
-define i32 @testmswq(fp128 %x) nounwind {
-; X86-NOSSE-LABEL: testmswq:
-; X86-NOSSE:       # %bb.0: # %entry
+define i32 @test_lrint_i32_f128(fp128 %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i32_f128:
+; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
 ; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    andl $-16, %esp
@@ -121,8 +118,8 @@ define i32 @testmswq(fp128 %x) nounwind {
 ; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl
 ;
-; X86-SSE2-LABEL: testmswq:
-; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-LABEL: test_lrint_i32_f128:
+; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    andl $-16, %esp
@@ -137,8 +134,8 @@ define i32 @testmswq(fp128 %x) nounwind {
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX-LABEL: testmswq:
-; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-LABEL: test_lrint_i32_f128:
+; X86-AVX:       # %bb.0:
 ; X86-AVX-NEXT:    pushl %ebp
 ; X86-AVX-NEXT:    movl %esp, %ebp
 ; X86-AVX-NEXT:    andl $-16, %esp
@@ -150,12 +147,176 @@ define i32 @testmswq(fp128 %x) nounwind {
 ; X86-AVX-NEXT:    popl %ebp
 ; X86-AVX-NEXT:    retl
 ;
-; X64-LABEL: testmswq:
-; X64:       # %bb.0: # %entry
+; X64-LABEL: test_lrint_i32_f128:
+; X64:       # %bb.0:
 ; X64-NEXT:    jmp lrintl@PLT # TAILCALL
-entry:
-  %0 = tail call i32 @llvm.lrint.i32.f128(fp128 %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lrint.i32.f128(fp128 %x)
+  ret i32 %conv
+}
+
+; FIXME: crash
+; define i32 @test_lrint_i32_f16_strict(half %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lrint.i32.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+
+define i32 @test_lrint_i32_f32_strict(float %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_lrint_i32_f32_strict:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    wait
+; X86-NOSSE-NEXT:    calll lrintf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i32_f32_strict:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lrintf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_lrint_i32_f32_strict:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    pushl %eax
+; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vmovss %xmm0, (%esp)
+; X86-AVX-NEXT:    calll lrintf
+; X86-AVX-NEXT:    popl %ecx
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_lrint_i32_f32_strict:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq lrintf@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %conv = tail call i32 @llvm.experimental.constrained.lrint.i32.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i32 %conv
+}
+
+define i32 @test_lrint_i32_f64_strict(double %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_lrint_i32_f64_strict:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    wait
+; X86-NOSSE-NEXT:    calll lrint
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i32_f64_strict:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lrint
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_lrint_i32_f64_strict:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    subl $8, %esp
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovsd %xmm0, (%esp)
+; X86-AVX-NEXT:    calll lrint
+; X86-AVX-NEXT:    addl $8, %esp
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_lrint_i32_f64_strict:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq lrint@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %conv = tail call i32 @llvm.experimental.constrained.lrint.i32.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i32 %conv
+}
+
+define i32 @test_lrint_i32_f80_strict(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: test_lrint_i32_f80_strict:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll lrintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lrint_i32_f80_strict:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq lrintl@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+  %conv = tail call i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i32 %conv
+}
+
+; FIXME(#44744): incorrect libcall
+define i32 @test_lrint_i32_f128_strict(fp128 %x) nounwind strictfp {
+; X86-NOSSE-LABEL: test_lrint_i32_f128_strict:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    subl $16, %esp
+; X86-NOSSE-NEXT:    pushl 20(%ebp)
+; X86-NOSSE-NEXT:    pushl 16(%ebp)
+; X86-NOSSE-NEXT:    pushl 12(%ebp)
+; X86-NOSSE-NEXT:    pushl 8(%ebp)
+; X86-NOSSE-NEXT:    calll lrintl
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    popl %ebp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i32_f128_strict:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    pushl 20(%ebp)
+; X86-SSE2-NEXT:    pushl 16(%ebp)
+; X86-SSE2-NEXT:    pushl 12(%ebp)
+; X86-SSE2-NEXT:    pushl 8(%ebp)
+; X86-SSE2-NEXT:    calll lrintl
+; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: test_lrint_i32_f128_strict:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    pushl %ebp
+; X86-AVX-NEXT:    movl %esp, %ebp
+; X86-AVX-NEXT:    andl $-16, %esp
+; X86-AVX-NEXT:    subl $32, %esp
+; X86-AVX-NEXT:    vmovups 8(%ebp), %xmm0
+; X86-AVX-NEXT:    vmovups %xmm0, (%esp)
+; X86-AVX-NEXT:    calll lrintl
+; X86-AVX-NEXT:    movl %ebp, %esp
+; X86-AVX-NEXT:    popl %ebp
+; X86-AVX-NEXT:    retl
+;
+; X64-LABEL: test_lrint_i32_f128_strict:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    callq lrintl@PLT
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %conv = tail call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i32 %conv
 }
 
 declare i32 @llvm.lrint.i32.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index 2ba1500df0b6..731c03bf0d74 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -1,92 +1,311 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown              | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2  | FileCheck %s --check-prefixes=X86,X86-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX
 
-define i64 @testmsxh(half %x) nounwind {
-; SSE-LABEL: testmsxh:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    callq rintf@PLT
-; SSE-NEXT:    callq __truncsfhf2@PLT
-; SSE-NEXT:    callq __extendhfsf2@PLT
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f16(half %x)
-  ret i64 %0
-}
+; FIXME: crash
+; define i64 @test_lrint_i64_f16(half %x) nounwind {
+;   %conv = tail call i64 @llvm.lrint.i64.f16(half %x)
+;   ret i64 %conv
+; }
 
-define i64 @testmsxs(float %x) nounwind {
-; SSE-LABEL: testmsxs:
-; SSE:       # %bb.0: # %entry
+define i64 @test_lrint_i64_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i64_f32:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    andl $-8, %esp
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    flds 8(%ebp)
+; X86-NOSSE-NEXT:    fistpll (%esp)
+; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    popl %ebp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i64_f32:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-8, %esp
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    flds (%esp)
+; X86-SSE2-NEXT:    fistpll (%esp)
+; X86-SSE2-NEXT:    movl (%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; SSE-LABEL: test_lrint_i64_f32:
+; SSE:       # %bb.0:
 ; SSE-NEXT:    cvtss2si %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: testmsxs:
-; AVX:       # %bb.0: # %entry
+; AVX-LABEL: test_lrint_i64_f32:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtss2si %xmm0, %rax
 ; AVX-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f32(float %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.lrint.i64.f32(float %x)
+  ret i64 %conv
 }
 
-define i64 @testmsxd(double %x) nounwind {
-; SSE-LABEL: testmsxd:
-; SSE:       # %bb.0: # %entry
+define i64 @test_lrint_i64_f64(double %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i64_f64:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    andl $-8, %esp
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl 8(%ebp)
+; X86-NOSSE-NEXT:    fistpll (%esp)
+; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    popl %ebp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i64_f64:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-8, %esp
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    fldl (%esp)
+; X86-SSE2-NEXT:    fistpll (%esp)
+; X86-SSE2-NEXT:    movl (%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
+; SSE-LABEL: test_lrint_i64_f64:
+; SSE:       # %bb.0:
 ; SSE-NEXT:    cvtsd2si %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: testmsxd:
-; AVX:       # %bb.0: # %entry
+; AVX-LABEL: test_lrint_i64_f64:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtsd2si %xmm0, %rax
 ; AVX-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f64(double %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.lrint.i64.f64(double %x)
+  ret i64 %conv
 }
 
-define i64 @testmsll(x86_fp80 %x) nounwind {
-; CHECK-LABEL: testmsll:
-; CHECK:       # %bb.0: # %entry
+define i64 @test_lrint_i64_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_lrint_i64_f80:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    fldt 8(%ebp)
+; X86-NEXT:    fistpll (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f80:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; CHECK-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f80(x86_fp80 %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.lrint.i64.f80(x86_fp80 %x)
+  ret i64 %conv
 }
 
 ; FIXME(#44744): incorrect libcall
-define i64 @testmsxq(fp128 %x) nounwind {
-; CHECK-LABEL: testmsxq:
-; CHECK:       # %bb.0: # %entry
+define i64 @test_lrint_i64_f128(fp128 %x) nounwind {
+; X86-LABEL: test_lrint_i64_f128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll lrintl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f128:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    jmp lrintl@PLT # TAILCALL
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f128(fp128 %x)
-  ret i64 %0
+  %conv = tail call i64 @llvm.lrint.i64.f128(fp128 %x)
+  ret i64 %conv
+}
+
+; FIXME: crash
+; define i64 @test_lrint_i64_f16_strict(half %x) nounwind {
+;   %conv = tail call i64 @llvm.experimental.constrained.lrint.i64.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %conv
+; }
+
+define i64 @test_lrint_i64_f32_strict(float %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i64_f32_strict:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    calll lrintf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i64_f32_strict:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lrintf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f32_strict:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq lrintf@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
+  %conv = tail call i64 @llvm.experimental.constrained.lrint.i64.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %conv
+}
+
+define i64 @test_lrint_i64_f64_strict(double %x) nounwind {
+; X86-NOSSE-LABEL: test_lrint_i64_f64_strict:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    calll lrint
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lrint_i64_f64_strict:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lrint
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f64_strict:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq lrint@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
+  %conv = tail call i64 @llvm.experimental.constrained.lrint.i64.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %conv
+}
+
+define i64 @test_lrint_i64_f80_strict(x86_fp80 %x) nounwind {
+; X86-LABEL: test_lrint_i64_f80_strict:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll lrintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f80_strict:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt (%rsp)
+; CHECK-NEXT:    callq lrintl@PLT
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    retq
+  %conv = tail call i64 @llvm.experimental.constrained.lrint.i64.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %conv
+}
+
+; FIXME(#44744): incorrect libcall
+define i64 @test_lrint_i64_f128_strict(fp128 %x) nounwind {
+; X86-LABEL: test_lrint_i64_f128_strict:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll lrintl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; CHECK-LABEL: test_lrint_i64_f128_strict:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq lrintl@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
+  %conv = tail call i64 @llvm.experimental.constrained.lrint.i64.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+  ret i64 %conv
 }
 
 define i32 @PR125324(float %x) nounwind {
+; X86-NOSSE-LABEL: PR125324:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
+; X86-NOSSE-NEXT:    andl $-8, %esp
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    flds 8(%ebp)
+; X86-NOSSE-NEXT:    fistpll (%esp)
+; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    movl %ebp, %esp
+; X86-NOSSE-NEXT:    popl %ebp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: PR125324:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-8, %esp
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    flds (%esp)
+; X86-SSE2-NEXT:    fistpll (%esp)
+; X86-SSE2-NEXT:    movl (%esp), %eax
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
+; X86-SSE2-NEXT:    retl
+;
 ; SSE-LABEL: PR125324:
-; SSE:       # %bb.0: # %entry
+; SSE:       # %bb.0:
 ; SSE-NEXT:    cvtss2si %xmm0, %rax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR125324:
-; AVX:       # %bb.0: # %entry
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtss2si %xmm0, %rax
 ; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX-NEXT:    retq
-entry:
-  %0 = tail call i64 @llvm.lrint.i64.f32(float %x)
-  %1 = trunc i64 %0 to i32
-  ret i32 %1
+  %conv = tail call i64 @llvm.lrint.i64.f32(float %x)
+  %trunc = trunc i64 %conv to i32
+  ret i32 %trunc
 }
 
 declare i64 @llvm.lrint.i64.f32(float) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lround-conv-i32.ll b/llvm/test/CodeGen/X86/lround-conv-i32.ll
index c37536623143..389f29233dcc 100644
--- a/llvm/test/CodeGen/X86/lround-conv-i32.ll
+++ b/llvm/test/CodeGen/X86/lround-conv-i32.ll
@@ -1,17 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown             | FileCheck %s
-; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown              | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2  | FileCheck %s --check-prefixes=X86,X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown            | FileCheck %s --check-prefixes=X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
 
-define i32 @testmsws(float %x) nounwind {
-; CHECK-LABEL: testmsws:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lroundf # TAILCALL
+; FIXME: crash
+; define i32 @test_lround_i32_f16(half %x) nounwind {
+;   %conv = tail call i32 @llvm.lround.i32.f16(half %x)
+;   ret i32 %conv
+; }
+
+define i32 @test_lround_i32_f32(float %x) nounwind {
+; X86-LABEL: test_lround_i32_f32:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp lroundf # TAILCALL
+;
+; X64-LABEL: test_lround_i32_f32:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp lroundf@PLT # TAILCALL
 ;
-; GISEL-X86-LABEL: testmsws:
-; GISEL-X86:       # %bb.0: # %entry
+; GISEL-X86-LABEL: test_lround_i32_f32:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl %eax, (%esp)
@@ -19,28 +29,27 @@ define i32 @testmsws(float %x) nounwind {
 ; GISEL-X86-NEXT:    addl $12, %esp
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmsws:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp lroundf@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsws:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_lround_i32_f32:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
 ; GISEL-X64-NEXT:    callq lroundf
 ; GISEL-X64-NEXT:    popq %rcx
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f32(float %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lround.i32.f32(float %x)
+  ret i32 %conv
 }
 
-define i32 @testmswd(double %x) nounwind {
-; CHECK-LABEL: testmswd:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lround # TAILCALL
+define i32 @test_lround_i32_f64(double %x) nounwind {
+; X86-LABEL: test_lround_i32_f64:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp lround # TAILCALL
+;
+; X64-LABEL: test_lround_i32_f64:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp lround@PLT # TAILCALL
 ;
-; GISEL-X86-LABEL: testmswd:
-; GISEL-X86:       # %bb.0: # %entry
+; GISEL-X86-LABEL: test_lround_i32_f64:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
 ; GISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -53,28 +62,27 @@ define i32 @testmswd(double %x) nounwind {
 ; GISEL-X86-NEXT:    addl $12, %esp
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmswd:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp lround@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmswd:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_lround_i32_f64:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    pushq %rax
 ; GISEL-X64-NEXT:    callq lround
 ; GISEL-X64-NEXT:    popq %rcx
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f64(double %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lround.i32.f64(double %x)
+  ret i32 %conv
 }
 
-define i32 @testmsll(x86_fp80 %x) nounwind {
-; CHECK-LABEL: testmsll:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lroundl # TAILCALL
+define i32 @test_lround_i32_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_lround_i32_f80:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp lroundl # TAILCALL
+;
+; X64-LABEL: test_lround_i32_f80:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp lroundl@PLT # TAILCALL
 ;
-; GISEL-X86-LABEL: testmsll:
-; GISEL-X86:       # %bb.0: # %entry
+; GISEL-X86-LABEL: test_lround_i32_f80:
+; GISEL-X86:       # %bb.0:
 ; GISEL-X86-NEXT:    subl $12, %esp
 ; GISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    fstpt (%esp)
@@ -82,19 +90,91 @@ define i32 @testmsll(x86_fp80 %x) nounwind {
 ; GISEL-X86-NEXT:    addl $12, %esp
 ; GISEL-X86-NEXT:    retl
 ;
-; X64-LABEL: testmsll:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    jmp lroundl@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsll:
-; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-LABEL: test_lround_i32_f80:
+; GISEL-X64:       # %bb.0:
 ; GISEL-X64-NEXT:    subq $24, %rsp
 ; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; GISEL-X64-NEXT:    fstpt (%rsp)
 ; GISEL-X64-NEXT:    callq lroundl
 ; GISEL-X64-NEXT:    addq $24, %rsp
 ; GISEL-X64-NEXT:    retq
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f80(x86_fp80 %x)
-  ret i32 %0
+  %conv = tail call i32 @llvm.lround.i32.f80(x86_fp80 %x)
+  ret i32 %conv
 }
+
+define i32 @test_lround_i32_f128(fp128 %x) nounwind {
+; X86-LABEL: test_lround_i32_f128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll lroundl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lround_i32_f128:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp lroundl@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_lround_i32_f128:
+; GISEL-X86:       # %bb.0:
+; GISEL-X86-NEXT:    pushl %esi
+; GISEL-X86-NEXT:    subl $24, %esp
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT:    movl %eax, (%esp)
+; GISEL-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    calll lroundf128
+; GISEL-X86-NEXT:    addl $24, %esp
+; GISEL-X86-NEXT:    popl %esi
+; GISEL-X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: test_lround_i32_f128:
+; GISEL-X64:       # %bb.0:
+; GISEL-X64-NEXT:    pushq %rax
+; GISEL-X64-NEXT:    callq lroundf128
+; GISEL-X64-NEXT:    popq %rcx
+; GISEL-X64-NEXT:    retq
+  %conv = tail call i32 @llvm.lround.i32.f128(fp128 %x)
+  ret i32 %conv
+}
+
+; FIXME: not yet implemented in global isel
+; define i32 @test_lround_i32_f16_strict(half %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lround.i32.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+
+; define i32 @test_lround_i32_f32_strict(float %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lround.i32.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+
+; define i32 @test_lround_i32_f64_strict(double %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lround.i32.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+
+; define i32 @test_lround_i32_f80_strict(x86_fp80 %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+
+; define i32 @test_lround_i32_f128_strict(fp128 %x) nounwind strictfp {
+;   %conv = tail call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i32 %conv
+; }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X86-NOSSE: {{.*}}
+; X86-SSE2: {{.*}}
diff --git a/llvm/test/CodeGen/X86/lround-conv-i64.ll b/llvm/test/CodeGen/X86/lround-conv-i64.ll
index 36b86f30ca13..8b8230074728 100644
--- a/llvm/test/CodeGen/X86/lround-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lround-conv-i64.ll
@@ -1,42 +1,86 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown             | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown            | FileCheck %s --check-prefixes=X64
 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
 
-define i64 @testmsxs(float %x) {
-; GISEL-X86-LABEL: testmsxs:
+; FIXME: crash
+; define i64 @test_lround_i64_f16(half %x) nounwind {
+; entry:
+;   %0 = tail call i64 @llvm.lround.i64.f16(half %x)
+;   ret i64 %0
+; }
+
+define i64 @test_lround_i64_f32(float %x) nounwind {
+; X86-NOSSE-LABEL: test_lround_i64_f32:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstps (%esp)
+; X86-NOSSE-NEXT:    calll lroundf
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lround_i64_f32:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    pushl %eax
+; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT:    movss %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lroundf
+; X86-SSE2-NEXT:    popl %ecx
+; X86-SSE2-NEXT:    retl
+;
+; X64-LABEL: test_lround_i64_f32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp lroundf@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_lround_i64_f32:
 ; GISEL-X86:       # %bb.0: # %entry
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl %eax, (%esp)
 ; GISEL-X86-NEXT:    calll lroundf
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; CHECK-LABEL: testmsxs:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lroundf@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsxs:
+; GISEL-X64-LABEL: test_lround_i64_f32:
 ; GISEL-X64:       # %bb.0: # %entry
 ; GISEL-X64-NEXT:    pushq %rax
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X64-NEXT:    callq lroundf
 ; GISEL-X64-NEXT:    popq %rcx
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
 entry:
   %0 = tail call i64 @llvm.lround.i64.f32(float %x)
   ret i64 %0
 }
 
-define i64 @testmsxd(double %x) {
-; GISEL-X86-LABEL: testmsxd:
+define i64 @test_lround_i64_f64(double %x) nounwind {
+; X86-NOSSE-LABEL: test_lround_i64_f64:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    subl $8, %esp
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fstpl (%esp)
+; X86-NOSSE-NEXT:    calll lround
+; X86-NOSSE-NEXT:    addl $8, %esp
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: test_lround_i64_f64:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    subl $8, %esp
+; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE2-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE2-NEXT:    calll lround
+; X86-SSE2-NEXT:    addl $8, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X64-LABEL: test_lround_i64_f64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp lround@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_lround_i64_f64:
 ; GISEL-X86:       # %bb.0: # %entry
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; GISEL-X86-NEXT:    movl 4(%eax), %eax
@@ -46,53 +90,131 @@ define i64 @testmsxd(double %x) {
 ; GISEL-X86-NEXT:    movl %eax, 4(%edx)
 ; GISEL-X86-NEXT:    calll lround
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; CHECK-LABEL: testmsxd:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lround@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsxd:
+; GISEL-X64-LABEL: test_lround_i64_f64:
 ; GISEL-X64:       # %bb.0: # %entry
 ; GISEL-X64-NEXT:    pushq %rax
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X64-NEXT:    callq lround
 ; GISEL-X64-NEXT:    popq %rcx
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
 entry:
   %0 = tail call i64 @llvm.lround.i64.f64(double %x)
   ret i64 %0
 }
 
-define i64 @testmsll(x86_fp80 %x) {
-; GISEL-X86-LABEL: testmsll:
+define i64 @test_lround_i64_f80(x86_fp80 %x) nounwind {
+; X86-LABEL: test_lround_i64_f80:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll lroundl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lround_i64_f80:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp lroundl@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_lround_i64_f80:
 ; GISEL-X86:       # %bb.0: # %entry
 ; GISEL-X86-NEXT:    subl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-X86-NEXT:    fldt {{[0-9]+}}(%esp)
 ; GISEL-X86-NEXT:    fstpt (%esp)
 ; GISEL-X86-NEXT:    calll lroundl
 ; GISEL-X86-NEXT:    addl $12, %esp
-; GISEL-X86-NEXT:    .cfi_def_cfa_offset 4
 ; GISEL-X86-NEXT:    retl
 ;
-; CHECK-LABEL: testmsll:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    jmp lroundl@PLT # TAILCALL
-;
-; GISEL-X64-LABEL: testmsll:
+; GISEL-X64-LABEL: test_lround_i64_f80:
 ; GISEL-X64:       # %bb.0: # %entry
 ; GISEL-X64-NEXT:    subq $24, %rsp
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 32
 ; GISEL-X64-NEXT:    fldt {{[0-9]+}}(%rsp)
 ; GISEL-X64-NEXT:    fstpt (%rsp)
 ; GISEL-X64-NEXT:    callq lroundl
 ; GISEL-X64-NEXT:    addq $24, %rsp
-; GISEL-X64-NEXT:    .cfi_def_cfa_offset 8
 ; GISEL-X64-NEXT:    retq
 entry:
   %0 = tail call i64 @llvm.lround.i64.f80(x86_fp80 %x)
   ret i64 %0
 }
+
+define i64 @test_lround_i64_f128(fp128 %x) nounwind {
+; X86-LABEL: test_lround_i64_f128:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    pushl 20(%ebp)
+; X86-NEXT:    pushl 16(%ebp)
+; X86-NEXT:    pushl 12(%ebp)
+; X86-NEXT:    pushl 8(%ebp)
+; X86-NEXT:    calll lroundl
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_lround_i64_f128:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    jmp lroundl@PLT # TAILCALL
+;
+; GISEL-X86-LABEL: test_lround_i64_f128:
+; GISEL-X86:       # %bb.0: # %entry
+; GISEL-X86-NEXT:    pushl %esi
+; GISEL-X86-NEXT:    subl $24, %esp
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT:    movl %eax, (%esp)
+; GISEL-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT:    calll lroundf128
+; GISEL-X86-NEXT:    addl $24, %esp
+; GISEL-X86-NEXT:    popl %esi
+; GISEL-X86-NEXT:    retl
+;
+; GISEL-X64-LABEL: test_lround_i64_f128:
+; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-NEXT:    pushq %rax
+; GISEL-X64-NEXT:    callq lroundf128
+; GISEL-X64-NEXT:    popq %rcx
+; GISEL-X64-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x)
+  ret i64 %0
+}
+
+; FIXME: not yet implemented in global isel
+; define i64 @test_lround_i64_f16_strict(half %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.lround.i64.f16(half %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
+
+; define i64 @test_lround_i64_f32_strict(float %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.lround.i64.f32(float %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
+
+; define i64 @test_lround_i64_f64_strict(double %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.lround.i64.f64(double %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
+
+; define i64 @test_lround_i64_f80_strict(x86_fp80 %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.lround.i64.f80(x86_fp80 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
+
+; define i64 @test_lround_i64_f128_strict(fp128 %x) nounwind strictfp {
+; entry:
+;   %0 = tail call i64 @llvm.experimental.constrained.lround.i64.f128(fp128 %x, metadata!"round.dynamic", metadata!"fpexcept.strict")
+;   ret i64 %0
+; }
diff --git a/llvm/test/CodeGen/X86/lvi-hardening-ret.ll b/llvm/test/CodeGen/X86/lvi-hardening-ret.ll
index faa8bff8f094..954985a3798b 100644
--- a/llvm/test/CodeGen/X86/lvi-hardening-ret.ll
+++ b/llvm/test/CodeGen/X86/lvi-hardening-ret.ll
@@ -41,9 +41,9 @@ entry:
   %add = add nsw i32 %0, %1
   ret i32 %add
 ; CHECK-NOT:  retq
-; CHECK:      popq %rcx
+; CHECK:      popq %rsi
 ; CHECK-NEXT: lfence
-; CHECK-NEXT: jmpq *%rcx
+; CHECK-NEXT: jmpq *%rsi
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
@@ -52,9 +52,9 @@ define dso_local preserve_mostcc void @preserve_most() #0 {
 entry:
   ret void
 ; CHECK-NOT:  retq
-; CHECK:      popq %rax
+; CHECK:      popq %r11
 ; CHECK-NEXT: lfence
-; CHECK-NEXT: jmpq *%rax
+; CHECK-NEXT: jmpq *%r11
 }
 
 ; Function Attrs: noinline nounwind optnone uwtable
@@ -63,9 +63,9 @@ define dso_local preserve_allcc void @preserve_all() #0 {
 entry:
   ret void
 ; CHECK-NOT:  retq
-; CHECK:      popq %rax
+; CHECK:      popq %r11
 ; CHECK-NEXT: lfence
-; CHECK-NEXT: jmpq *%rax
+; CHECK-NEXT: jmpq *%r11
 }
 
 define { i64, i128 } @ret_i64_i128() #0 {
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index fb2433dbbb1e..7c9adaf31aff 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -730,36 +730,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
 ; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm6
 ; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm7
 ; AVX1-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX2-LABEL: vec256_i64_signed_mem_reg:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
 ; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm4
 ; AVX2-NEXT:    vpsrlq $33, %ymm0, %ymm0
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
@@ -790,36 +790,36 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; XOP-NEXT:    vmovdqa (%rdi), %xmm2
 ; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm5
 ; XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; XOP-NEXT:    vpsubq %xmm0, %xmm4, %xmm0
-; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm5
+; XOP-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
 ; XOP-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; XOP-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm6
 ; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm7
 ; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; XOP-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; XOP-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-NEXT:    retq
@@ -900,36 +900,36 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm5
 ; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlq $1, %xmm3, %xmm6
 ; AVX1-NEXT:    vpsrlq $1, %xmm2, %xmm7
 ; AVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm9, %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; AVX1-NEXT:    vpsrlq $33, %xmm3, %xmm3
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -937,20 +937,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX2-LABEL: vec256_i64_signed_reg_mem:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
 ; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
 ; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -960,36 +960,36 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; XOP-NEXT:    vmovdqa (%rdi), %xmm2
 ; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT:    vpcomgtq %xmm2, %xmm0, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm3, %xmm1, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm2, %xmm0, %xmm5
 ; XOP-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; XOP-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsubq %xmm2, %xmm4, %xmm2
-; XOP-NEXT:    vpcomgtq %xmm3, %xmm1, %xmm5
+; XOP-NEXT:    vpxor %xmm5, %xmm2, %xmm2
+; XOP-NEXT:    vpsubq %xmm2, %xmm5, %xmm2
 ; XOP-NEXT:    vpsubq %xmm3, %xmm1, %xmm3
-; XOP-NEXT:    vpxor %xmm5, %xmm3, %xmm3
-; XOP-NEXT:    vpsubq %xmm3, %xmm5, %xmm3
+; XOP-NEXT:    vpxor %xmm4, %xmm3, %xmm3
+; XOP-NEXT:    vpsubq %xmm3, %xmm4, %xmm3
 ; XOP-NEXT:    vpsrlq $1, %xmm3, %xmm6
 ; XOP-NEXT:    vpsrlq $1, %xmm2, %xmm7
 ; XOP-NEXT:    vpsrlq $33, %xmm2, %xmm2
 ; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; XOP-NEXT:    vpmuludq %xmm2, %xmm9, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
-; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; XOP-NEXT:    vpsrlq $33, %xmm3, %xmm3
-; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; XOP-NEXT:    vpmuludq %xmm7, %xmm3, %xmm3
-; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOP-NEXT:    vpsllq $32, %xmm3, %xmm3
-; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-NEXT:    retq
@@ -1071,36 +1071,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; AVX1-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
 ; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm6
 ; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm7
 ; AVX1-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; AVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; AVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; AVX1-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; AVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; AVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
+; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm3
 ; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm4
 ; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -1133,36 +1133,36 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; XOP-NEXT:    vmovdqa (%rdi), %xmm2
 ; XOP-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm4
+; XOP-NEXT:    vpcomgtq %xmm0, %xmm2, %xmm5
 ; XOP-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; XOP-NEXT:    vpxor %xmm4, %xmm0, %xmm0
-; XOP-NEXT:    vpsubq %xmm0, %xmm4, %xmm0
-; XOP-NEXT:    vpcomgtq %xmm1, %xmm3, %xmm5
+; XOP-NEXT:    vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT:    vpsubq %xmm0, %xmm5, %xmm0
 ; XOP-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpxor %xmm5, %xmm1, %xmm1
-; XOP-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT:    vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm6
 ; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm7
 ; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm8 = [1,1]
-; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm9
+; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm9
 ; XOP-NEXT:    vpmuludq %xmm0, %xmm9, %xmm0
-; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
-; XOP-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
-; XOP-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
+; XOP-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
+; XOP-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
-; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm4
+; XOP-NEXT:    vpmuludq %xmm7, %xmm9, %xmm5
 ; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
-; XOP-NEXT:    vpor %xmm5, %xmm8, %xmm7
+; XOP-NEXT:    vpor %xmm4, %xmm8, %xmm7
 ; XOP-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm5, %xmm5
-; XOP-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
+; XOP-NEXT:    vpsrlq $32, %xmm4, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
-; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm5
-; XOP-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-NEXT:    vpmuludq %xmm7, %xmm6, %xmm4
+; XOP-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; XOP-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; XOP-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/movrs-avx10.2-512-intrinsics.ll b/llvm/test/CodeGen/X86/movrs-avx10.2-512-intrinsics.ll
index a730ef519c01..a478577155f1 100644
--- a/llvm/test/CodeGen/X86/movrs-avx10.2-512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/movrs-avx10.2-512-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+movrs,+avx10.2-512 -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+movrs,+avx10.2 -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK
 
 declare <64 x i8> @llvm.x86.avx10.vmovrsb512(ptr)
 declare <16 x i32> @llvm.x86.avx10.vmovrsd512(ptr)
diff --git a/llvm/test/CodeGen/X86/movrs-avx10.2-intrinsics.ll b/llvm/test/CodeGen/X86/movrs-avx10.2-intrinsics.ll
index 583e16351652..62613d773a36 100644
--- a/llvm/test/CodeGen/X86/movrs-avx10.2-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/movrs-avx10.2-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+movrs,+avx10.2-256 -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+movrs,+avx10.2 -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK
 
 define <2 x i64> @test_mm_movrsb_epu8(ptr %__A) {
 ; CHECK-LABEL: test_mm_movrsb_epu8:
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 9e398096bfcc..693d1992091b 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -93,10 +93,8 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-NEXT:    psrld $1, %xmm0
 ; SSE2-NEXT:    pslld $31, %xmm3
 ; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [715827883,715827883,715827883,715827883]
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: p4_vector_urem_by_const__splat:
@@ -104,9 +102,9 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
 ; SSE4-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT:    psrld $1, %xmm0
-; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
-; SSE4-NEXT:    pminud %xmm0, %xmm1
-; SSE4-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
+; SSE4-NEXT:    pcmpgtd %xmm0, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX2-LABEL: p4_vector_urem_by_const__splat:
@@ -116,9 +114,8 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) {
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
 ; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
-; AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [715827883,715827883,715827883,715827883]
+; AVX2-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
   %t0 = and <4 x i32> %x, <i32 128, i32 128, i32 128, i32 128> ; clearly a power-of-two or zero
   %t1 = urem <4 x i32> %t0, <i32 6, i32 6, i32 6, i32 6> ; '6' is clearly not a power of two
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 8d155bd57df1..1e3204dfc999 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -16,9 +16,9 @@
 ; CHECK-NEXT: Target Pass Configuration
 ; CHECK-NEXT: Machine Module Information
 ; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
diff --git a/llvm/test/CodeGen/X86/peep-test-5.ll b/llvm/test/CodeGen/X86/peep-test-5.ll
index 52bcbe9f83d7..a4af93b81023 100644
--- a/llvm/test/CodeGen/X86/peep-test-5.ll
+++ b/llvm/test/CodeGen/X86/peep-test-5.ll
@@ -51,3 +51,54 @@ end:
 }
 
 declare void @free_object()
+
+; Check TEST instruction would not be combined with CMP.
+define i1 @pr155586(i8 %0) {
+; CHECK-LABEL: pr155586:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    andb %cl, %al
+; CHECK-NEXT:    retq
+entry:
+  %cmp88.not = icmp eq i8 %0, 1
+  %1 = and i8 %0, 1
+  %tobool161.not = icmp eq i8 %1, 0
+  %common.ret.op = select i1 %cmp88.not, i1 false, i1 %tobool161.not
+  ret i1 %common.ret.op
+}
+
+; Check TEST8rr instruction would not be combined with TEST8ri.
+define i32 @pr155828() {
+; CHECK-LABEL: pr155828:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB2_1: # %func_188.exit.i.i
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne .LBB2_1
+; CHECK-NEXT:  # %bb.2: # %if.else.i.i.i
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    setg %al
+; CHECK-NEXT:    retq
+entry:
+  br label %func_188.exit.i.i
+
+func_188.exit.i.i:                                ; preds = %func_188.exit.i.i, %entry
+  %or659.i167180.i.i = phi i32 [ 0, %entry ], [ 1, %func_188.exit.i.i ]
+  %conv48.i.i = trunc i32 %or659.i167180.i.i to i8
+  %and.i.i.i = and i32 %or659.i167180.i.i, 1
+  %tobool80.not.i.i.i = icmp eq i32 %and.i.i.i, 0
+  br i1 %tobool80.not.i.i.i, label %if.else.i.i.i, label %func_188.exit.i.i
+
+if.else.i.i.i:                                    ; preds = %func_188.exit.i.i
+  %cmp183.i.i.i = icmp sgt i8 %conv48.i.i, 0
+  %ext = zext i1 %cmp183.i.i.i to i32
+  ret i32 %ext
+}
diff --git a/llvm/test/CodeGen/X86/pr156256.ll b/llvm/test/CodeGen/X86/pr156256.ll
new file mode 100644
index 000000000000..13caa6fee587
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr156256.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+define <16 x i16> @PR156256(<16 x i32> %a, <16 x i32> %b) {
+; AVX512-LABEL: PR156256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: PR156256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512VL-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %icmp = icmp ugt <16 x i32> %a, %b
+  %sext = sext <16 x i1> %icmp to <16 x i16>
+  %and = and <16 x i16> %sext, splat (i16 16256)
+  ret <16 x i16> %and
+}
diff --git a/llvm/test/CodeGen/X86/pr156817.ll b/llvm/test/CodeGen/X86/pr156817.ll
new file mode 100644
index 000000000000..80972ecc5abb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr156817.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64 -mattr=+egpr | FileCheck %s --check-prefix=EGPR
+
+define coldcc i32 @foo() nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq bar@PLT
+; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    retq
+;
+; EGPR-LABEL: foo:
+; EGPR:       # %bb.0:
+; EGPR-NEXT:    pushq %rax
+; EGPR-NEXT:    callq bar@PLT
+; EGPR-NEXT:    popq %r16
+; EGPR-NEXT:    retq
+  %1 = tail call coldcc i32 @bar()
+  ret i32 %1
+}
+
+declare coldcc i32 @bar()
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index c3c96e822879..6a0c13526ac1 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -260,7 +260,6 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    .cfi_offset %ebx, -8
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    xorl %ebx, %ebx
 ; CHECK-NEXT:    jmp .LBB1_1
@@ -272,10 +271,9 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    movl %edx, %ebx
-; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:  .LBB1_1: # %for.cond
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    je .LBB1_3
 ; CHECK-NEXT:  # %bb.2: # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    xorl %eax, %eax
@@ -283,12 +281,11 @@ define void @verifier_error_reduced_issue38788(i1 %cmp11) {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB1_3: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; CHECK-NEXT:    testb $1, %al
 ; CHECK-NEXT:    je .LBB1_4
 ; CHECK-NEXT:  # %bb.9: # %if.then13
 ; CHECK-NEXT:    # in Loop: Header=BB1_1 Depth=1
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    movl $0, %ebx
 ; CHECK-NEXT:    jne .LBB1_8
diff --git a/llvm/test/CodeGen/X86/pr40289-64bit.ll b/llvm/test/CodeGen/X86/pr40289-64bit.ll
index 58da5258a670..96c8377eb0f0 100644
--- a/llvm/test/CodeGen/X86/pr40289-64bit.ll
+++ b/llvm/test/CodeGen/X86/pr40289-64bit.ll
@@ -6,5 +6,5 @@ define cc 92 < 9 x i64 > @clobber() {
   ret < 9 x i64 > undef
   ; CHECK-LABEL: clobber:
   ; CHECK-NOT: popq %rsp
-  ; CHECK: addq $8, %rsp
+  ; CHECK: popq %rax
 }
diff --git a/llvm/test/CodeGen/X86/pr40289.ll b/llvm/test/CodeGen/X86/pr40289.ll
index 851b23c002bd..21e50931b40f 100644
--- a/llvm/test/CodeGen/X86/pr40289.ll
+++ b/llvm/test/CodeGen/X86/pr40289.ll
@@ -6,5 +6,5 @@ define < 3 x i32 > @clobber() {
   ret < 3 x i32 > undef
   ; CHECK-LABEL: clobber:
   ; CHECK-NOT: popl %esp
-  ; CHECK: addl $4, %esp
+  ; CHECK: popl %eax
 }
diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll
index 946380971988..accdd04f084d 100644
--- a/llvm/test/CodeGen/X86/pr67333.ll
+++ b/llvm/test/CodeGen/X86/pr67333.ll
@@ -7,19 +7,25 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
 define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
 ; CHECK-LABEL: SHA256_Compress_Generic:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movbel 0, %eax
-; CHECK-NEXT:    movbel 12(%rdi), %ecx
+; CHECK-NEXT:    movl 0, %eax
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    movl 12(%rdi), %ecx
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vmovd %eax, %xmm0
 ; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
 ; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
 ; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
-; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
+; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm3
+; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm0
 ; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
-; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
-; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm3
-; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm0
+; CHECK-NEXT:    vpor %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm3, %xmm0
+; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %ecx, %xmm4
 ; CHECK-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr90844.ll b/llvm/test/CodeGen/X86/pr90844.ll
deleted file mode 100644
index b250c3f6f9a2..000000000000
--- a/llvm/test/CodeGen/X86/pr90844.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-evex512 < %s | FileCheck %s
-
-define void @PR90844() {
-; CHECK-LABEL: PR90844:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, (%rax)
-; CHECK-NEXT:    retq
-entry:
-  %0 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> poison, <2 x i32> poison, <2 x i32> <i32 8, i32 24>)
-  %1 = and <2 x i32> %0, <i32 16711935, i32 -134152448>
-  %2 = or disjoint <2 x i32> zeroinitializer, %1
-  %3 = zext <2 x i32> %2 to <2 x i64>
-  %4 = shl nuw <2 x i64> %3, <i64 32, i64 32>
-  %5 = or disjoint <2 x i64> %4, zeroinitializer
-  store <2 x i64> %5, ptr poison, align 16
-  ret void
-}
-
-define void @foo(ptr %0) {
-; CHECK-LABEL: foo:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT:    vpxor 32(%rdi), %ymm0, %ymm1
-; CHECK-NEXT:    vpxor (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
-; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-entry:
-  %1 = load <32 x half>, ptr %0
-  %2 = fneg <32 x half> %1
-  store <32 x half> %2, ptr %0
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 9323cd5b1917..7462c7748282 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -938,3 +938,206 @@ define i128 @lshr_shl_mask(i128 %a0) {
   %2 = lshr i128 %1, 1
   ret i128 %2
 }
+
+define i128 @shift_i128_limited_shamt(i128 noundef %a, i32 noundef %b) nounwind {
+; i686-LABEL: shift_i128_limited_shamt:
+; i686:       # %bb.0: # %start
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    pushl %ebx
+; i686-NEXT:    pushl %edi
+; i686-NEXT:    pushl %esi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $16, %esp
+; i686-NEXT:    movl 32(%ebp), %ebx
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movzbl 40(%ebp), %ecx
+; i686-NEXT:    movb $6, %dl
+; i686-NEXT:    subb %cl, %dl
+; i686-NEXT:    addb $-7, %cl
+; i686-NEXT:    movl %edi, %eax
+; i686-NEXT:    shrl %eax
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shll %cl, %ebx
+; i686-NEXT:    orl %eax, %ebx
+; i686-NEXT:    movl 24(%ebp), %esi
+; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 8(%ebp), %edi
+; i686-NEXT:    movl 36(%ebp), %esi
+; i686-NEXT:    movl 32(%ebp), %edx
+; i686-NEXT:    shldl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, 12(%edi)
+; i686-NEXT:    movl %ebx, 8(%edi)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%edi)
+; i686-NEXT:    movl %eax, (%edi)
+; i686-NEXT:    movl %edi, %eax
+; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    popl %esi
+; i686-NEXT:    popl %edi
+; i686-NEXT:    popl %ebx
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl $4
+;
+; x86_64-LABEL: shift_i128_limited_shamt:
+; x86_64:       # %bb.0: # %start
+; x86_64-NEXT:    movq %rdi, %rax
+; x86_64-NEXT:    movb $6, %cl
+; x86_64-NEXT:    subb %dl, %cl
+; x86_64-NEXT:    shldq %cl, %rdi, %rsi
+; x86_64-NEXT:    shlq %cl, %rax
+; x86_64-NEXT:    movq %rsi, %rdx
+; x86_64-NEXT:    retq
+start:
+  %shamt = sub nuw nsw i32 6, %b
+  %ext = zext nneg i32 %shamt to i128
+  %res = shl i128 %a, %ext
+  ret i128 %res
+}
+
+define i128 @shift_i128_limited_shamt_no_nuw(i128 noundef %a, i32 noundef %b) nounwind {
+; i686-LABEL: shift_i128_limited_shamt_no_nuw:
+; i686:       # %bb.0: # %start
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    pushl %ebx
+; i686-NEXT:    pushl %edi
+; i686-NEXT:    pushl %esi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movzbl 40(%ebp), %eax
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 28(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %esi
+; i686-NEXT:    movl 36(%ebp), %edi
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movb $6, %cl
+; i686-NEXT:    subb %al, %cl
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    negb %al
+; i686-NEXT:    movsbl %al, %eax
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl 20(%esp,%eax), %edx
+; i686-NEXT:    movl 24(%esp,%eax), %ebx
+; i686-NEXT:    movl %ebx, %edi
+; i686-NEXT:    shldl %cl, %edx, %edi
+; i686-NEXT:    movl 16(%esp,%eax), %esi
+; i686-NEXT:    movl 28(%esp,%eax), %eax
+; i686-NEXT:    shldl %cl, %ebx, %eax
+; i686-NEXT:    movl 8(%ebp), %ebx
+; i686-NEXT:    movl %eax, 12(%ebx)
+; i686-NEXT:    movl %edi, 8(%ebx)
+; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    shldl %cl, %esi, %edx
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %eax, (%ebx)
+; i686-NEXT:    movl %ebx, %eax
+; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    popl %esi
+; i686-NEXT:    popl %edi
+; i686-NEXT:    popl %ebx
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl $4
+;
+; x86_64-LABEL: shift_i128_limited_shamt_no_nuw:
+; x86_64:       # %bb.0: # %start
+; x86_64-NEXT:    movb $6, %cl
+; x86_64-NEXT:    subb %dl, %cl
+; x86_64-NEXT:    shldq %cl, %rdi, %rsi
+; x86_64-NEXT:    shlq %cl, %rdi
+; x86_64-NEXT:    xorl %eax, %eax
+; x86_64-NEXT:    testb $64, %cl
+; x86_64-NEXT:    cmovneq %rdi, %rsi
+; x86_64-NEXT:    cmoveq %rdi, %rax
+; x86_64-NEXT:    movq %rsi, %rdx
+; x86_64-NEXT:    retq
+start:
+  %shamt = sub nsw i32 6, %b
+  %ext = zext nneg i32 %shamt to i128
+  %res = shl i128 %a, %ext
+  ret i128 %res
+}
+
+define i128 @shift_i128_limited_shamt_unknown_lhs(i128 noundef %a, i32 noundef %b, i32 noundef %c) nounwind {
+; i686-LABEL: shift_i128_limited_shamt_unknown_lhs:
+; i686:       # %bb.0: # %start
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    pushl %ebx
+; i686-NEXT:    pushl %edi
+; i686-NEXT:    pushl %esi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $48, %esp
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %esi
+; i686-NEXT:    movl 36(%ebp), %edi
+; i686-NEXT:    movl 44(%ebp), %ecx
+; i686-NEXT:    subl 40(%ebp), %ecx
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl $0, (%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrb $3, %al
+; i686-NEXT:    andb $12, %al
+; i686-NEXT:    negb %al
+; i686-NEXT:    movsbl %al, %eax
+; i686-NEXT:    movl 20(%esp,%eax), %edx
+; i686-NEXT:    movl 24(%esp,%eax), %ebx
+; i686-NEXT:    movl %ebx, %edi
+; i686-NEXT:    shldl %cl, %edx, %edi
+; i686-NEXT:    movl 16(%esp,%eax), %esi
+; i686-NEXT:    movl 28(%esp,%eax), %eax
+; i686-NEXT:    shldl %cl, %ebx, %eax
+; i686-NEXT:    movl 8(%ebp), %ebx
+; i686-NEXT:    movl %eax, 12(%ebx)
+; i686-NEXT:    movl %edi, 8(%ebx)
+; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    shll %cl, %eax
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %esi, %edx
+; i686-NEXT:    movl %edx, 4(%ebx)
+; i686-NEXT:    movl %eax, (%ebx)
+; i686-NEXT:    movl %ebx, %eax
+; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    popl %esi
+; i686-NEXT:    popl %edi
+; i686-NEXT:    popl %ebx
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl $4
+;
+; x86_64-LABEL: shift_i128_limited_shamt_unknown_lhs:
+; x86_64:       # %bb.0: # %start
+; x86_64-NEXT:    subl %edx, %ecx
+; x86_64-NEXT:    shldq %cl, %rdi, %rsi
+; x86_64-NEXT:    shlq %cl, %rdi
+; x86_64-NEXT:    xorl %eax, %eax
+; x86_64-NEXT:    testb $64, %cl
+; x86_64-NEXT:    cmovneq %rdi, %rsi
+; x86_64-NEXT:    cmoveq %rdi, %rax
+; x86_64-NEXT:    movq %rsi, %rdx
+; x86_64-NEXT:    retq
+start:
+  %shamt = sub nuw nsw i32 %c, %b
+  %ext = zext nneg i32 %shamt to i128
+  %res = shl i128 %a, %ext
+  ret i128 %res
+}
diff --git a/llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll b/llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll
index 825a11d66cd4..8d99ad07e22e 100644
--- a/llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/sm4-evex-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=i686-- --show-mc-encoding -mattr=+sm4,+avx10.2-512 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-- --show-mc-encoding -mattr=+sm4,+avx10.2 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-- --show-mc-encoding -mattr=+sm4,+avx10.2 | FileCheck %s
 
 define <4 x i32> @test_int_x86_vsm4key4128(<4 x i32> %A, <4 x i32> %B) {
 ; CHECK-LABEL: test_int_x86_vsm4key4128:
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
index 4b0f63f9a638..cd576b19f876 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -8,10 +8,10 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
 declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
 define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssd:
@@ -125,7 +125,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
   ret <8 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -135,11 +135,11 @@ define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a
 ; CHECK-NEXT:    {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2)
   ret <4 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -150,11 +150,11 @@ define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
 ; CHECK-NEXT:    {vex} vpdpbusd %xmm1, %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1)
   ret <4 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -164,11 +164,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32
 ; CHECK-NEXT:    {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2)
   ret <8 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -179,11 +179,11 @@ define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1,
 ; CHECK-NEXT:    {vex} vpdpbusd %ymm1, %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1)
   ret <8 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusds:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -193,11 +193,11 @@ define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %
 ; CHECK-NEXT:    {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2)
   ret <4 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -208,11 +208,11 @@ define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
 ; CHECK-NEXT:    {vex} vpdpbusds %xmm1, %xmm2, %xmm0
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1)
   ret <4 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -222,11 +222,11 @@ define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
 ; CHECK-NEXT:    {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2)
   ret <8 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -237,6 +237,6 @@ define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
 ; CHECK-NEXT:    {vex} vpdpbusds %ymm1, %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1)
   ret <8 x i32> %2
 }
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index 35688e59fc9f..766ccdbada53 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -79,7 +79,7 @@ define <8 x half> @f11(<2 x double> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f11:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtsd2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <2 x double> %a0, i32 0
   %cvt = call half @llvm.experimental.constrained.fptrunc.f16.f64(double %ext,
@@ -140,7 +140,7 @@ define <8 x half> @f17(<4 x float> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f17:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtss2sh %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vmovsh {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <4 x float> %a0, i32 0
   %cvt = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %ext,
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 6b8a03ba5eb7..762900e0bb18 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -9,8 +9,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1 | FileCheck %s --check-prefixes=AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
 
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 6fbc10307e0b..0b98a9388adc 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -6,9 +6,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX10,AVX10_256
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1 | FileCheck %s --check-prefixes=AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
 
@@ -118,10 +117,10 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -273,10 +272,10 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -426,10 +425,10 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -680,34 +679,6 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
-; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VLVBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
-; AVX512VLVBMI2-NEXT:    vpermi2b %zmm0, %zmm1, %zmm3
-; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0
-; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT:    vpsllvw %zmm0, %zmm3, %zmm0
-; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT:    retq
-;
-; AVX10_256-LABEL: var_funnnel_v32i8:
-; AVX10_256:       # %bb.0:
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10_256-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
-; AVX10_256-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
-; AVX10_256-NEXT:    vpsllvw %ymm5, %ymm3, %ymm3
-; AVX10_256-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
-; AVX10_256-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
-; AVX10_256-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX10_256-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
-; AVX10_256-NEXT:    retq
-;
 ; XOPAVX1-LABEL: var_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
@@ -840,11 +811,11 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX10-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -957,11 +928,11 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VLBW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX10-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -1078,11 +1049,11 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX10-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -1212,17 +1183,17 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VLBW-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v32i8:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX10-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
-; AVX10-NEXT:    vpsrlw $8, %ymm3, %ymm3
-; AVX10-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
-; AVX10-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX10-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512VLVBMI2-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
@@ -1452,25 +1423,25 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
 ; AVX512VLBW-NEXT:    vzeroupper
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: fancierRotate2:
-; AVX10:       # %bb.0: # %entry
-; AVX10-NEXT:    vpbroadcastd %edx, %ymm0
-; AVX10-NEXT:    vpbroadcastd %ecx, %ymm1
-; AVX10-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; AVX10-NEXT:    .p2align 4
-; AVX10-NEXT:  .LBB8_1: # %loop
-; AVX10-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX10-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX10-NEXT:    vptestnmb %xmm2, %xmm2, %k1
-; AVX10-NEXT:    vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
-; AVX10-NEXT:    vmovdqu 4096(%rdi,%rax,4), %ymm3
-; AVX10-NEXT:    vprolvd %ymm2, %ymm3, %ymm2
-; AVX10-NEXT:    vmovdqu %ymm2, 4096(%rdi,%rax,4)
-; AVX10-NEXT:    addq $8, %rax
-; AVX10-NEXT:    jne .LBB8_1
-; AVX10-NEXT:  # %bb.2: # %exit
-; AVX10-NEXT:    vzeroupper
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: fancierRotate2:
+; AVX512VLVBMI2:       # %bb.0: # %entry
+; AVX512VLVBMI2-NEXT:    vpbroadcastd %edx, %ymm0
+; AVX512VLVBMI2-NEXT:    vpbroadcastd %ecx, %ymm1
+; AVX512VLVBMI2-NEXT:    movq $-1024, %rax # imm = 0xFC00
+; AVX512VLVBMI2-NEXT:    .p2align 4
+; AVX512VLVBMI2-NEXT:  .LBB8_1: # %loop
+; AVX512VLVBMI2-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX512VLVBMI2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512VLVBMI2-NEXT:    vptestnmb %xmm2, %xmm2, %k1
+; AVX512VLVBMI2-NEXT:    vpblendmd %ymm0, %ymm1, %ymm2 {%k1}
+; AVX512VLVBMI2-NEXT:    vmovdqu 4096(%rdi,%rax,4), %ymm3
+; AVX512VLVBMI2-NEXT:    vprolvd %ymm2, %ymm3, %ymm2
+; AVX512VLVBMI2-NEXT:    vmovdqu %ymm2, 4096(%rdi,%rax,4)
+; AVX512VLVBMI2-NEXT:    addq $8, %rax
+; AVX512VLVBMI2-NEXT:    jne .LBB8_1
+; AVX512VLVBMI2-NEXT:  # %bb.2: # %exit
+; AVX512VLVBMI2-NEXT:    vzeroupper
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: fancierRotate2:
 ; XOPAVX1:       # %bb.0: # %entry
@@ -1623,10 +1594,10 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -1721,10 +1692,10 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -1824,10 +1795,10 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -1947,28 +1918,6 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
-; AVX512VLVBMI2:       # %bb.0:
-; AVX512VLVBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VLVBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VLVBMI2-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
-; AVX512VLVBMI2-NEXT:    vpermi2b %zmm0, %zmm1, %zmm2
-; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT:    retq
-;
-; AVX10_256-LABEL: constant_funnnel_v32i8:
-; AVX10_256:       # %bb.0:
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10_256-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX10_256-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10_256-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX10_256-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX10_256-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
-; AVX10_256-NEXT:    retq
-;
 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
@@ -2069,10 +2018,10 @@ define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwi
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldq $14, %ymm1, %ymm0, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldq $14, %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -2154,10 +2103,10 @@ define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwi
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldd $4, %ymm1, %ymm0, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldd $4, %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -2239,10 +2188,10 @@ define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) no
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -2330,12 +2279,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v32i8:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
-; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index bf525442a419..20be5791309f 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -9,8 +9,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1 | FileCheck %s --check-prefixes=AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
 
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index b0a1a91bdccc..1f164635910c 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -6,9 +6,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX10,AVX10_256
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX10,AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1 | FileCheck %s --check-prefixes=AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
 
@@ -118,11 +117,11 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt)
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvq %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvq %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -274,11 +273,11 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt)
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvd %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvd %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -454,11 +453,11 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: var_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvw %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvw %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: var_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -720,20 +719,6 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; AVX10_256-LABEL: var_funnnel_v32i8:
-; AVX10_256:       # %bb.0:
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10_256-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
-; AVX10_256-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
-; AVX10_256-NEXT:    vpsrlvw %ymm5, %ymm3, %ymm3
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
-; AVX10_256-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm1
-; AVX10_256-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX10_256-NEXT:    vpermi2b %ymm3, %ymm1, %ymm0
-; AVX10_256-NEXT:    retq
-;
 ; XOPAVX1-LABEL: var_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
@@ -870,12 +855,12 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX10-NEXT:    vpshrdvq %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshrdvq %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -988,12 +973,12 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX512VLBW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastd %xmm2, %ymm2
-; AVX10-NEXT:    vpshrdvd %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshrdvd %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -1110,12 +1095,12 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatvar_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpbroadcastw %xmm2, %ymm2
-; AVX10-NEXT:    vpshrdvw %ymm2, %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm2, %ymm2
+; AVX512VLVBMI2-NEXT:    vpshrdvw %ymm2, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -1265,17 +1250,6 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
 ; AVX512VLVBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; AVX10_256-LABEL: splatvar_funnnel_v32i8:
-; AVX10_256:       # %bb.0:
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10_256-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX10_256-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10_256-NEXT:    vpsrlw %xmm2, %ymm0, %ymm1
-; AVX10_256-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX10_256-NEXT:    vpermi2b %ymm3, %ymm1, %ymm0
-; AVX10_256-NEXT:    retq
-;
 ; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -1388,11 +1362,11 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -1487,11 +1461,11 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -1591,11 +1565,11 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwin
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: constant_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; AVX10-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: constant_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -1761,16 +1735,6 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512VLVBMI2-NEXT:    retq
 ;
-; AVX10_256-LABEL: constant_funnnel_v32i8:
-; AVX10_256:       # %bb.0:
-; AVX10_256-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10_256-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX10_256-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10_256-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; AVX10_256-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX10_256-NEXT:    vpermi2b %ymm2, %ymm1, %ymm0
-; AVX10_256-NEXT:    retq
-;
 ; XOPAVX1-LABEL: constant_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
@@ -1869,10 +1833,10 @@ define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwi
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v4i64:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdq $14, %ymm0, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdq $14, %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
 ; XOPAVX1:       # %bb.0:
@@ -1954,10 +1918,10 @@ define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwi
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v8i32:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdd $4, %ymm0, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdd $4, %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
 ; XOPAVX1:       # %bb.0:
@@ -2039,10 +2003,10 @@ define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) no
 ; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v16i16:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpshrdw $7, %ymm0, %ymm1, %ymm0
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpshrdw $7, %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
 ; XOPAVX1:       # %bb.0:
@@ -2130,12 +2094,12 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
-; AVX10-LABEL: splatconstant_funnnel_v32i8:
-; AVX10:       # %bb.0:
-; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
-; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
-; AVX10-NEXT:    retq
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VLVBMI2:       # %bb.0:
+; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
+; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
 ; XOPAVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll
new file mode 100644
index 000000000000..31949403b446
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vectorization-remarks-loopid-dbg.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+  %diff = alloca i32, align 4
+  %cb = alloca [16 x i8], align 16
+  %cc = alloca [16 x i8], align 16
+  store i32 0, ptr %diff, align 4, !tbaa !11
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [16 x i8], ptr %cb, i64 0, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1, !tbaa !21
+  %conv = sext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [16 x i8], ptr %cc, i64 0, i64 %indvars.iv
+  %1 = load i8, ptr %arrayidx2, align 1, !tbaa !21
+  %conv3 = sext i8 %1 to i32
+  %sub = sub i32 %conv, %conv3
+  %add = add nsw i32 %sub, %add8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !25
+
+for.end:                                          ; preds = %for.body
+  store i32 %add, ptr %diff, align 4, !tbaa !11
+  call void @ibar(ptr %diff) #2
+  ret i32 0
+}
+
+declare void @ibar(ptr) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)
+!25 = !{!25, !15}
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
index de76d90bf6b6..474b77665867 100644
--- a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
@@ -106,7 +106,7 @@ body:             |
 # RUN:    -x86-wineh-unwindv2-force-mode=1 |  FileCheck %s \
 # RUN:    --check-prefix=BESTEFFORT
 # DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog':
-# DEALLOC-AFTER-EPILOG-SAME: Unexpected lea, mov or add instruction after the epilog
+# DEALLOC-AFTER-EPILOG-SAME: Unexpected lea or add instruction after the epilog
 
 --- |
   define dso_local void @dealloc_after_epilog() local_unnamed_addr {
@@ -161,6 +161,135 @@ body:             |
     RET64
 ...
 
+;--- mov_no_setframe.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/mov_no_setframe.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=MOV-NO-SETFRAME
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/mov_no_setframe.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# MOV-NO-SETFRAME: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_no_setframe':
+# MOV-NO-SETFRAME-SAME: The epilog is setting frame back, but prolog did not set it
+
+--- |
+  define dso_local void @mov_no_setframe() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            mov_no_setframe
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = MOV64rr $rbp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- mov_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/mov_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=MOV-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/mov_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
+# RUN:    -x86-wineh-unwindv2-force-mode=1 |  FileCheck %s \
+# RUN:    --check-prefix=BESTEFFORT
+# MOV-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_after_epilog':
+# MOV-AFTER-EPILOG-SAME: Unexpected mov instruction after the epilog
+
+--- |
+  define dso_local void @mov_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            mov_after_epilog
+body:             |
+  bb.0.entry:
+    $rbp = MOV64rr $rsp
+    frame-setup SEH_SetFrame 52, 0
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $rsp = MOV64rr $rbp
+    RET64
+...
+
+;--- pop_before_mov.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_before_mov.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-BEFORE-MOV
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_mov.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-BEFORE-MOV: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_mov':
+# POP-BEFORE-MOV-SAME: The epilog is setting the frame back after popping registers
+
+--- |
+  define dso_local void @pop_before_mov() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_before_mov
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    $rbp = MOV64rr $rsp
+    frame-setup SEH_SetFrame 52, 0
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsp = MOV64rr $rbp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- mov_after_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/mov_after_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=MOV-AFTER-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/mov_after_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# MOV-AFTER-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'mov_after_dealloc':
+# MOV-AFTER-DEALLOC-SAME: Cannot set the frame back after the stack allocation has been deallocated
+
+--- |
+  define dso_local void @mov_after_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            mov_after_dealloc
+body:             |
+  bb.0.entry:
+    $rbp = MOV64rr $rsp
+    frame-setup SEH_SetFrame 52, 0
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    $rsp = MOV64rr $rbp
+    SEH_EndEpilogue
+    RET64
+...
+
 ;--- too_many_pops.mir
 # RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \
 # RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll b/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
index 326127a919f3..0d92d044e1b9 100644
--- a/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2.ll
@@ -171,9 +171,44 @@ define dso_local void @large_aligned_alloc() align 16 {
 ; CHECK-NEXT:   retq
 ; CHECK-NEXT:   .seh_endproc
 
+define dso_local void @set_frame_only() local_unnamed_addr {
+  tail call i64 @llvm.x86.flags.read.u64()
+  ret void
+}
+
+; CHECK-LABEL:  set_frame_only:
+; CHECK:        .seh_unwindversion 2
+; CHECK:        .seh_pushreg %rbp
+; CHECK:        .seh_setframe %rbp, 0
+; CHECK:        .seh_endprologue
+; CHECK-NOT:    .seh_endproc
+; CHECK:        .seh_startepilogue
+; CHECK-NEXT:   .seh_unwindv2start
+; CHECK-NEXT:   popq    %rbp
+; CHECK-NEXT:   .seh_endepilogue
+; CHECK-NEXT:   retq
+; CHECK-NEXT:   .seh_endproc
+
+attributes #1 = { noreturn }
+define dso_local void @no_return_func() local_unnamed_addr #1 {
+entry:
+  call void @d()
+  unreachable
+}
+; CHECK-LABEL:  no_return_func:
+; CHECK-NOT:    .seh_unwindversion 2
+; CHECK:        .seh_stackalloc
+; CHECK-NEXT:   .seh_endprologue
+; CHECK-NOT:    .seh_startepilogue
+; CHECK-NOT:    .seh_unwindv2start
+; CHECK:        int3
+; CHECK-NEXT:   .seh_endproc
+
+declare i64 @llvm.x86.flags.read.u64()
 declare void @a() local_unnamed_addr
 declare i32 @b() local_unnamed_addr
 declare i32 @c(i32) local_unnamed_addr
+declare void @d() local_unnamed_addr #1
 
 !llvm.module.flags = !{!0}
-!0 = !{i32 1, !"winx64-eh-unwindv2", i32 1}
+!0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
diff --git a/llvm/test/CodeGen/X86/xor-not-combine.ll b/llvm/test/CodeGen/X86/xor-not-combine.ll
new file mode 100644
index 000000000000..af65ade35ce8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/xor-not-combine.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; Test for DAG combine: fold (not (sub Y, X)) -> (add X, ~Y)
+; when Y is a constant.
+
+; Test case 1: Y is a constant - should transform to (add X, ~Y)
+define i32 @test_not_sub_constant(i32 %x) {
+; CHECK-LABEL: test_not_sub_constant:
+; CHECK:       # %bb.0:
+; CHECK:         leal -101(%rdi), %eax
+; CHECK-NEXT:    retq
+  %sub = sub i32 100, %x
+  %not = xor i32 %sub, -1
+  ret i32 %not
+}
+
+; Test case 2: Y is not a constant - should NOT optimize
+define i32 @test_not_sub_non_constant(i32 %x, i32 %y) {
+; CHECK-LABEL: test_not_sub_non_constant:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    retq
+  %sub = sub i32 %y, %x
+  %not = xor i32 %sub, -1
+  ret i32 %not
+}
author	Mingming Liu <mingmingl@google.com>	2025-09-10 15:25:31 -0700
committer	GitHub <noreply@github.com>	2025-09-10 15:25:31 -0700
commit	1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch)
tree	57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/test/CodeGen/X86
parent	898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff)
parent	b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff)